I have been trying to extract the text from some Wikipedia dumps. I need to get the text from the id, title, ns, timestamp, username, ip, and text tags in the full-history English Wikipedia dump.
I read and modified the code from https://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html.
I was able to write the code below:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os
PATH_WIKI_XML = '/home/wikipedia'
FILENAME_WIKI = 'enwiki-latest-pages-meta-history1.xml-p24706p25444'
FILENAME_ARTICLES = 'articles.csv'
FILENAME_TEMPLATE = 'articles_template.csv'
ENCODING = "utf-8"
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

def strip_tag_name(t):
    # Drop the '{namespace}' prefix that ElementTree prepends to tag names
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t
pathWikiXML = os.path.join(PATH_WIKI_XML, FILENAME_WIKI)
pathArticles = os.path.join(PATH_WIKI_XML, FILENAME_ARTICLES)
pathTemplateRedirect = os.path.join(PATH_WIKI_XML, FILENAME_TEMPLATE)
totalCount = 0
articleCount = 0
templateCount = 0
title = None
timestamp = None
username = None
ip = None
text = None
start_time = time.time()
with codecs.open(pathArticles, "w", ENCODING) as articlesFH, \
        codecs.open(pathTemplateRedirect, "w", ENCODING) as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)
    articlesWriter.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)
        if event == 'start':
            if tname == 'page':
                title = ''
                id = -1
                inrevision = False
                incontributor = False
                ns = 0
            elif tname == 'revision':
                # Do not pick up on revision id's
                inrevision = True
            elif tname == 'contributor':
                incontributor = True
        else:
            if tname == 'title':
                title = elem.text
            elif tname == 'id' and not inrevision and not incontributor:
                id = int(elem.text)
            elif tname == 'ns':
                ns = int(elem.text)
            elif tname == 'timestamp':
                timestamp = elem.text
            elif tname == 'username':
                username = elem.text
            elif tname == 'ip':
                ip = elem.text
            elif tname == 'text':
                text = elem.text
            elif tname == 'page':
                totalCount += 1
                if ns == 10:
                    templateCount += 1
                    templateWriter.writerow([id, title])
                elif len(title) > 0:
                    articleCount += 1
                    articlesWriter.writerow([id, title, timestamp, username, ip, text])
                # if totalCount > 100000:
                #     break
                if totalCount > 1 and (totalCount % 100000) == 0:
                    print("{:,}".format(totalCount))
            elem.clear()
elapsed_time = time.time() - start_time
print("Total pages: {:,}".format(totalCount))
print("Template pages: {:,}".format(templateCount))
print("Article pages: {:,}".format(articleCount))
print("Elapsed time: {}".format(hms_string(elapsed_time)))
However, the resulting CSV file only had the id and the title; the other columns were empty. I guess that's because the timestamp tag is nested within the revision tag, but my events only deal with the start and end of the page tag. username and ip are likewise nested within the contributor tag, as seen in the sample XML file below. Can someone advise me on how to solve this? Should I loop events within events to extract the text from the desired nested tags? Is there code that would work here? The desired output is a file with the desired tags as headers and their text in rows. Remember, one page can have many revisions, each with different metadata. I want to get all the text from the desired tags, as well as the metadata, within a page, and only after I have everything from that page proceed to the next page. Thanks.
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/
                               http://www.mediawiki.org/xml/export-0.10.xsd"
           version="0.10" xml:lang="sco">
  <siteinfo>
    <sitename>Wikipedia</sitename>
    <dbname>scowiki</dbname>
    <base>http://sco.wikipedia.org/wiki/Main_Page</base>
    <generator>MediaWiki 1.25wmf12</generator>
    <case>first-letter</case>
    <namespaces>
      <namespace key="-2" case="first-letter">Media</namespace>
      <namespace key="-1" case="first-letter">Special</namespace>
      <namespace key="0" case="first-letter" />
      <namespace key="1" case="first-letter">Talk</namespace>
      <namespace key="2" case="first-letter">User</namespace>
      <namespace key="3" case="first-letter">User talk</namespace>
      <namespace key="4" case="first-letter">Wikipedia</namespace>
      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
      <namespace key="6" case="first-letter">File</namespace>
      <namespace key="7" case="first-letter">File talk</namespace>
      <namespace key="8" case="first-letter">MediaWiki</namespace>
      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
      <namespace key="10" case="first-letter">Template</namespace>
      <namespace key="11" case="first-letter">Template talk</namespace>
      <namespace key="12" case="first-letter">Help</namespace>
      <namespace key="13" case="first-letter">Help talk</namespace>
      <namespace key="14" case="first-letter">Category</namespace>
      <namespace key="15" case="first-letter">Category talk</namespace>
      <namespace key="100" case="first-letter">Portal</namespace>
      <namespace key="101" case="first-letter">Portal talk</namespace>
      <namespace key="828" case="first-letter">Module</namespace>
      <namespace key="829" case="first-letter">Module talk</namespace>
    </namespaces>
  </siteinfo>
  <page>
    <title>Inglis leid</title>
    <ns>0</ns>
    <id>2</id>
    <revision>
      <id>7</id>
      <timestamp>2005-06-22T10:17:05Z</timestamp>
      <contributor>
        <ip>24.251.198.251</ip>
      </contributor>
      <model>wikitext</model>
      <format>text/x-wiki</format>
      <text xml:space="preserve">Tha '''Inglis''' (English) leid is a west [[Gairmanic leid]] at cam frae Ingland an thats forebear wis [[auld Inglis]]. Tha name "English" cams frae tha pairt o [[Gairmanie]] caw'd "Angeln". Inglis is tha waruld's seicont maist widelie spaken first leid, an his aboot 340 million hameborn speikers waruldwide.
[[en:English language]]</text>
      <sha1>6m5yxiaalrm6te7e3x3fiw1aq7wk9ir</sha1>
    </revision>
  </page>
</mediawiki>
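For reference, iterparse yields start and end events for every element at every depth, not only for <page>, so no nested loops are needed to reach the inner tags. A tiny standalone demo of this, using a hypothetical miniature document that mirrors the dump's nesting:

import io
import xml.etree.ElementTree as etree

# Hypothetical miniature document mirroring the dump's nesting
xml = b"<page><revision><contributor><ip>1.2.3.4</ip></contributor></revision></page>"

for event, elem in etree.iterparse(io.BytesIO(xml), events=('start', 'end')):
    print(event, elem.tag)
# Prints start/end pairs for page, revision, contributor and ip:
# nested tags all arrive in the same flat event stream.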
CodePudding user response:
Simplify the attempted script to the bare minimum you need, for instance by removing the timings. iterparse is used here as it usually is for very large XML files: to iteratively parse tag by tag, wherever the tag resides in the document (root, parent, child, descendant, etc.). Therefore, clean up the logic tag by tag, and on the last needed tag write a row to the CSV with the currently assigned variables, which are reset on every <page> tag.
pathWikiXML = "Input.xml"
pathWikiCSV = "Output.csv"
def strip_tag_name(t):
return t.split("}")[1] if "}" in t else t
with codecs.open(pathWikiCSV, "w", "utf-8") as f:
cw = csv.writer(f)
cw.writerow(['id', 'title', 'timestamp','username','ip','text'])
for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
tname = strip_tag_name(elem.tag)
if event == 'start':
if tname == 'page':
title = None
timestamp = None
username = None
ip = None
text = None
elif tname == 'title':
title = elem.text
elif tname == 'id':
id = int(elem.text)
elif tname == 'ns':
ns = int(elem.text)
elif tname == 'timestamp':
timestamp = elem.text
elif tname == 'username':
username = elem.text
elif tname == 'ip':
ip = elem.text
elif tname == 'text':
text = elem.text
cw.writerow([id, title, timestamp, username, ip, text])
CSV Output
id | title       | timestamp            | username | ip             | text
---|-------------|----------------------|----------|----------------|--------------------------------
7  | Inglis leid | 2005-06-22T10:17:05Z |          | 24.251.198.251 | "Tha '''Inglis''' (English)...
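One caveat worth noting: iterparse still accumulates the parsed tree in memory as it goes, so on a multi-gigabyte full-history dump the loop above will keep growing. Below is a minimal sketch of the usual remedy (assuming the same Input.xml/Output.csv names as above): keep a reference to the root element from the first event and clear finished pages. It also writes one row per </revision> rather than per <text>, since the question stresses that one page can have many revisions.

import csv
import xml.etree.ElementTree as etree

def strip_tag_name(t):
    return t.split("}")[1] if "}" in t else t

with open("Output.csv", "w", encoding="utf-8", newline="") as f:
    cw = csv.writer(f)
    cw.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])

    context = etree.iterparse("Input.xml", events=('start', 'end'))
    event, root = next(context)  # the first event is the 'start' of <mediawiki>

    title = rev_id = timestamp = username = ip = text = None
    for event, elem in context:
        tname = strip_tag_name(elem.tag)
        if event == 'start':
            if tname == 'page':
                title = None
            elif tname == 'revision':
                # one output row per revision, so reset per revision
                rev_id = timestamp = username = ip = text = None
            continue
        # only 'end' events reach here; the element text is complete now
        if tname == 'title':
            title = elem.text
        elif tname == 'id' and rev_id is None:
            # first <id> after a <revision> starts is the revision id;
            # later contributor <id>s are ignored because rev_id is set
            rev_id = elem.text
        elif tname == 'timestamp':
            timestamp = elem.text
        elif tname == 'username':
            username = elem.text
        elif tname == 'ip':
            ip = elem.text
        elif tname == 'text':
            text = elem.text
        elif tname == 'revision':
            cw.writerow([rev_id, title, timestamp, username, ip, text])
        elif tname == 'page':
            # prune the finished page from the root so memory stays flat
            root.clear()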
CodePudding user response:
That sample code you posted is rather complex. Like @Parfait recommended, pare the problem down, get something basic working, and build a basic understanding of how it works. The iterparse approach with events is powerful, but it is complex and confusing to start out with.
There's another way to search for elements in XML: XPath. It lets you search by relationship: "find this element under this element". Once you have an element, all you have to do is read its text property; no stripping of namespaces. Oh! You do need to account for the namespace in the queries, though, but that's pretty easy.
#!/usr/bin/env python3
import csv
import xml.etree.ElementTree as ET
# Set up the namespace that needs to be a part of every XPath query
ns_dict = {'xmlns': 'http://www.mediawiki.org/xml/export-0.10/'}
# Open the file and parse it; ET.parse returns an ElementTree wrapping the root
tree = ET.parse('sample.xml')

# Get the page element as a Python object. Every XPath query must use the
# 'xmlns:' prefix (with a colon) and pass `ns_dict` from earlier.
# './/xmlns:page' means "find, from here (the root), the page element at any
# depth (in the predefined xmlns namespace)".
page = tree.find('.//xmlns:page', ns_dict)
# With the page "element", find its direct children (always using `xmlns:` and passing `ns_dict`)
# `./xmlns:<element>` means "find, from here (at page), the element that's only 1 level (directly) under page"
pg_title = page.find('./xmlns:title', ns_dict)
pg_ns = page.find('./xmlns:ns', ns_dict)
pg_id = page.find('./xmlns:id', ns_dict)
# Get the revision element
revision = page.find('./xmlns:revision', ns_dict)
rev_id = revision.find('./xmlns:id', ns_dict)
rev_ts = revision.find('./xmlns:timestamp', ns_dict)
# Find ip under contributor
contrib_ip = revision.find('./xmlns:contributor/xmlns:ip', ns_dict)
print('page title:', pg_title.text)
print('page id:', pg_id.text)
print('rev id:', rev_id.text)
print('rev timestamp:', rev_ts.text)
print('contributor ip:', contrib_ip.text)
# From here, write out to a CSV
with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Page title', 'Page id', 'Rev id', 'Rev timestamp', 'Contributor ip'])
    writer.writerow([pg_title.text, pg_id.text, rev_id.text, rev_ts.text, contrib_ip.text])
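The snippet above reads just the first page and its first revision, while the question needs every revision of every page. Here is a minimal sketch of one way to extend the same XPath approach with findall, assuming the same sample.xml and ns_dict. Note that ET.parse loads the whole file into memory, so this suits the small sample; for the full-history dump, iterparse as in the other answer is the right tool.

#!/usr/bin/env python3
import csv
import xml.etree.ElementTree as ET

ns_dict = {'xmlns': 'http://www.mediawiki.org/xml/export-0.10/'}
tree = ET.parse('sample.xml')

def text_of(parent, path):
    # Return the element's text, or None when the tag is absent
    # (a <contributor> has either <username> or <ip>, not both)
    el = parent.find(path, ns_dict)
    return el.text if el is not None else None

with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'title', 'timestamp', 'username', 'ip', 'text'])
    # One row per revision, for every page in the file
    for page in tree.findall('.//xmlns:page', ns_dict):
        title = text_of(page, './xmlns:title')
        for rev in page.findall('./xmlns:revision', ns_dict):
            writer.writerow([
                text_of(rev, './xmlns:id'),
                title,
                text_of(rev, './xmlns:timestamp'),
                text_of(rev, './xmlns:contributor/xmlns:username'),
                text_of(rev, './xmlns:contributor/xmlns:ip'),
                text_of(rev, './xmlns:text'),
            ])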