I know how to separate it when the data looks like:
x, y, z
But I can't figure out how to do it when the data format is like:
Doe, John, BookName, Year, abstract with commas, links.
This is what the data looks like in Excel after the scrape.
This is what I wanted it to look like.
This is my code
"""Scrape publication metadata from the USU institutional repository
and write it to a comma-separated CSV file."""
import csv

import requests
import urllib3
from bs4 import BeautifulSoup

# The repository serves a certificate requests cannot verify, so verification
# is disabled below; silence the corresponding warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1  # was "count_page =1", which reset the counter each pass
    print('Scraping Offset No:', count_page)
    # was "requests.get(url str(page))" — the missing "+" is a SyntaxError.
    # Also pass the headers that were defined but never used.
    result = requests.get(url + str(page), headers=headers, verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        # Each <li class="ds-artifact-item"> holds one publication record.
        author = it.find('span', 'author h4').text
        # strip/split/join removes embedded newlines from the title text
        title = ''.join(it.find('a', href=True).text.strip().split('\n'))
        year = it.find('span', 'date').text
        abstract = ''.join(it.find('div', 'artifact-abstract').text.strip().split('\n'))
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# Context manager guarantees the CSV file is flushed and closed.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f)
    thewriter.writerow(kepala)
    thewriter.writerows(datas)
CodePudding user response:
This is my suggestion. I will need to know an offset to be able to test it.
A CSV separated by semi-colons will be far easier to separate in Excel.
"""Scrape publication metadata from the USU institutional repository and
write a semicolon-delimited CSV, which Excel splits into columns even when
the abstracts themselves contain commas."""
import csv

import requests
import urllib3
from bs4 import BeautifulSoup

# The repository serves a certificate requests cannot verify, so verification
# is disabled below; silence the corresponding warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

fakdep = '165'
offset = input('Please enter number of offset:')
url = 'https://repositori.usu.ac.id/handle/123456789/{}?offset={}'.format(fakdep, offset)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}

datas = []
count_page = 0
for page in range(1, 2):
    count_page += 1  # was "count_page =1", which reset the counter each pass
    print('Scraping Offset No:', count_page)
    # was "requests.get(url str(page))" — the missing "+" is a SyntaxError.
    # Also pass the headers that were defined but never used.
    result = requests.get(url + str(page), headers=headers, verify=False)
    soup = BeautifulSoup(result.text, 'html.parser')
    items = soup.find_all('li', 'ds-artifact-item')
    for it in items:
        # Each <li class="ds-artifact-item"> holds one publication record.
        author = it.find('span', 'author h4').text
        # was .replace('/n', '') — a forward slash, so newlines were never
        # removed; '\n' is the actual newline escape. The surrounding
        # ''.join(...) on a string was a no-op and is dropped.
        title = it.find('a', href=True).text.strip().replace('\n', '')
        year = it.find('span', 'date').text
        abstract = it.find('div', 'artifact-abstract').text.strip().replace('\n', '')
        link = it.find('a')['href']
        datas.append([author, title, year, abstract, link])

kepala = ['Author', 'Title', 'Year', 'Abstract', 'Link']
# Semicolon delimiter keeps Excel from splitting on the commas that occur
# naturally inside authors and abstracts.
with open('results/{}_{}.csv'.format(fakdep, offset), 'w', newline='') as f:
    thewriter = csv.writer(f, delimiter=';')
    thewriter.writerow(kepala)
    thewriter.writerows(datas)