How to separate data per column when writing data to excel from web scraping results


I know how to separate it when the data looks like:

x, y, z

But I can't figure out how to do it when the data format is like:

Doe, John, BookName, Year, abstract with commas, links. 
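
(For reference, Python's csv module quotes any field that contains the delimiter, so embedded commas do not by themselves break the columns; whether Excel splits them correctly depends on the delimiter it expects. A minimal sketch with made-up values:

import csv
import io

# Minimal sketch (made-up values): csv.writer quotes a field that
# contains the delimiter, so the commas inside 'Doe, John' and the
# abstract stay in a single column when the file is parsed back.
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(['Doe, John', 'BookName', '2020',
                 'an abstract, with, commas', 'https://example.com'])
print(buf.getvalue().strip())
# "Doe, John",BookName,2020,"an abstract, with, commas",https://example.com
)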

This is what the data looks like in Excel after the scrape (screenshot omitted).
This is what I wanted it to look like (screenshot omitted).
This is my code:

import requests
from bs4 import BeautifulSoup
import csv
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


fakdep = '165'  
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep,offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), headers=headers, verify=False)
    
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span','author h4').text
        title = ''.join(it.find('a',href=True).text.strip().split('\n'))
        year = it.find('span','date').text
        abstract = ''.join(it.find('div','artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f)
    thewriter.writerow(kepala)
    thewriter.writerows(datas)

CodePudding user response:

This is my suggestion. I will need to know an offset to be able to test it.

A CSV separated by semicolons will be far easier to split into columns in Excel.
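
The only functional change below is the delimiter=';' argument to csv.writer. In isolation (made-up values):

import csv
import io

# With delimiter=';', the embedded commas never collide with the
# column separator, so no quoting is even needed.
buf = io.StringIO()
csv.writer(buf, delimiter=';').writerow(['Doe, John', 'an abstract, with, commas'])
print(buf.getvalue().strip())
# Doe, John;an abstract, with, commas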

import requests
from bs4 import BeautifulSoup
import csv
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


fakdep = '165'  
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep,offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1
    print('Scraping Offset No:', count_page)
    result = requests.get(url + str(page), headers=headers, verify=False)
    
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li','ds-artifact-item')
    for it in items:
        author = it.find('span','author h4').text
        title = it.find('a', href=True).text.strip().replace('\n', '')  # '\n', not '/n'
        year = it.find('span','date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f, delimiter=';')
    thewriter.writerow(kepala)
    thewriter.writerows(datas)
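
To check the output outside Excel, the file can be read back with the same delimiter. A minimal sketch; the filename assumes fakdep '165' and an offset of 0, purely for illustration:

import csv

# Hypothetical filename: '165' and '0' stand in for the fakdep and
# offset entered above.
with open('results/165_0.csv', newline='') as f:
    for row in csv.reader(f, delimiter=';'):
        print(row)  # one Python list per CSV row

Whether Excel splits a .csv automatically depends on the list separator configured in the system locale; if the file opens in a single column, Data > Text to Columns with a semicolon delimiter will split it.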