I have a CSV with URLs that contain data I need to extract. Sometimes a URL contains no results, or several; if that is the case, I want to append a None
to the list.
This is the code:
import glob
from urllib.request import urlopen
import pandas as pd
import xml.etree.ElementTree as ET

count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb21.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        clean_aut = []
        title = []
        isbn_clean = []
        for row in csvfile['URL']:
            #print('row:', row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
                namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                    "zs": "http://www.loc.gov/zing/srw/",
                    "": "http://www.loc.gov/MARC21/slim",
                }
                datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
                datafield_attribute_filters = [  # which fields to extract
                    {
                        "tag": "100",  # author
                        "ind1": "1",
                        "ind2": " ",
                    }]
                #datafield_attribute_filters = []  # Uncomment this line to clear filters (and process each datafield node)
                for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                    if datafield_attribute_filters:
                        skip_node = True
                        for attr_dict in datafield_attribute_filters:
                            for k, v in attr_dict.items():
                                if datafield_node.get(k) != v:
                                    break
                            else:
                                skip_node = False
                                break
                        if skip_node:
                            continue
                    for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                        clean_aut.append(subfield_node.text)  # this gets the author name
        origdata = pd.DataFrame({'Author': clean_aut})
        print(clean_aut)
        print(origdata)
        count = 1
This is the list file with the URLs: Pastebin
How would I do that?
CodePudding user response:
You should clean your code before posting it; it is best to stick to a minimal reproducible example. In your case you can remove the outer loop over files so that we can focus on the part causing the problem.
About your code: set the variables that are not modified during the loop (the namespaces dict, the XPath, the attribute filters) before the loop. Also, the attribute check can be done with a single any() call, which makes your code lighter. In order to add None
to your list when no author is found, you can use a boolean flag:
namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
    "zs": "http://www.loc.gov/zing/srw/",
    "": "http://www.loc.gov/MARC21/slim",
}
datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
datafield_attribute_filters = [  # which fields to extract
    {
        "tag": "100",  # author
        "ind1": "1",
        "ind2": " ",
    }]
clean_aut = []
for row in csvfile['URL']:
    with urlopen(str(row)) as response:
        doc = ET.parse(response)
        root = doc.getroot()
        no_aut = True  # flips to False as soon as an author subfield is found for this URL
        for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
            # skip nodes that fail the attribute filter (equivalent to the
            # nested loop for a single filter dict)
            if any(datafield_node.get(k) != v for attr_dict in datafield_attribute_filters for k, v in attr_dict.items()):
                continue
            for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                clean_aut.append(subfield_node.text)  # this gets the author name
                no_aut = False
        if no_aut:
            clean_aut.append(None)
origdata = pd.DataFrame({'Author': clean_aut})
print(clean_aut)
print(origdata)
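# Side note: since the None placeholder keeps clean_aut the same length as
# csvfile['URL'], you could also keep each author next to its source URL
# (assuming the column is really named 'URL', as in the question):
# origdata = pd.DataFrame({'URL': csvfile['URL'], 'Author': clean_aut})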
Output:
[None, 'Bergren, Lisa Tawn', 'Rahlwes, Ann-Kathrin', 'Ortner, Helmut', 'Ladwig-Winters, Simone', 'Huonker, Thomas', 'Ritter, Karl-Markus', 'Kerkeling, Hape', 'Rohls, Jan', 'Rohls, Jan', 'Rohls, Jan', 'James, Bethan', None, 'Schmidt, Horst']
Author
0 None
1 Bergren, Lisa Tawn
2 Rahlwes, Ann-Kathrin
3 Ortner, Helmut
4 Ladwig-Winters, Simone
5 Huonker, Thomas
6 Ritter, Karl-Markus
7 Kerkeling, Hape
8 Rohls, Jan
9 Rohls, Jan
10 Rohls, Jan
11 James, Bethan
12 None
13 Schmidt, Horst
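One caveat: the any() condition is only equivalent to your original nested loop while datafield_attribute_filters contains a single dict. With several dicts, your original code kept a node as soon as one dict matched all of its attributes (an OR over the dicts). A minimal sketch of a condition that preserves that behaviour, reusing the same variable names (only the single-dict case is exercised by the output above):
        # Skip the node unless at least one filter dict matches all of its
        # attributes, reproducing the OR-over-dicts logic of the nested loop.
        if not any(
            all(datafield_node.get(k) == v for k, v in attr_dict.items())
            for attr_dict in datafield_attribute_filters
        ):
            continue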