I am trying to adapt a Python 2.7 routine for creating SQLite db dictionaries for an e-reader from TSV files.
The routine shown creates a (pk)zip file of an html string in lines 71-74 (# compress and save). I'd like the compressed file to be in gzip format instead. I'm really, really shaky on Python and I have been looking at examples of using gzip compression, trying to fit them into this routine (which I have already adapted in other ways for my own use from the original). I know that changes need to be made in line 9 (import) also, but am not sure which.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# by Jiri Orsag, 2014
# https://github.com/geoRG77/nook-dictionary
# Many thanks to Homeless Ghost for his script 'createRenateNSTdictionaryfromnookdictionarydb.py'
# which was a great source of ideas for my work
import sqlite3, sys, zipfile, zlib, os
# config
DICTIONARY_FILE = 'test.txt' # input file (needed)
OUTPUT_DB = 'ox_en_GB.db' # output file
TEMP_DIRECTORY = './temp/' # will be deleted after successful run
STEP = 10000 # for print message
########################################################
print 'Converting dictionary...'
con = sqlite3.connect(OUTPUT_DB)
con.text_factory = str
cur = con.cursor()
index = 0
duplicateCount = 1
prevTerm = ''
try:
if not os.path.exists(TEMP_DIRECTORY):
os.makedirs(TEMP_DIRECTORY)
# open dict file
dict = open(DICTIONARY_FILE, 'r')
# delete previous tables
cur.execute('DROP TABLE IF EXISTS android_metadata')
cur.execute('DROP TABLE IF EXISTS tblWords')
# create tables
cur.execute('CREATE TABLE "android_metadata"("locale" TEXT)')
cur.execute('CREATE TABLE "tblWords"("_id" INTEGER PRIMARY KEY AUTOINCREMENT, "term" TEXT COLLATE nocase, "description" BLOB)')
# convert dict to sql
for line in dict:
index = 1
# uncomment next line to debug
# print '# current line = %d' % index
# split line
data = line.split('\t')
term = data.pop(0)
# create HTML
html = '<b>' term '</b>' data[0].strip()
# check for duplicates
if term == prevTerm:
duplicateCount = 1
termEdited = term '[' str(duplicateCount) ']'
else:
termEdited = term
duplicateCount = 1
# create html file
term_stripped = termEdited.replace('/', '')
temp_html = open(TEMP_DIRECTORY term_stripped, 'wb')
temp_html.write(html)
temp_html.close()
# compress & save
zf = zipfile.ZipFile('_temp', mode='w')
zf.write(TEMP_DIRECTORY term_stripped)
zf.close()
# read & insert compressed data
temp_compressed = open('_temp', 'rb')
compressed = temp_compressed.read()
cur.execute('INSERT INTO tblWords (_id, term, description) VALUES(?, ?, ?)', (index, termEdited, sqlite3.Binary(compressed)))
# if duplicate then update previous row with [1]
if duplicateCount == 2:
cur.execute('UPDATE tblWords SET term="' str(term "[1]") '" WHERE _id=' str(index - 1) '')
os.remove(TEMP_DIRECTORY term_stripped)
prevTerm = term
# print _id, term, description
if ((index % STEP) == 0):
print '# current line = %d' % index
#if index == 100:
# break;
# create term_index
cur.execute('CREATE INDEX term_index on tblWords (term ASC)')
cur.execute('SELECT * FROM tblWords order by _id LIMIT 10')
dict.close
# os.remove('_temp')
# os.rmdir(TEMP_DIRECTORY)
except Exception, e:
raise
else:
pass
finally:
pass
print 'Done. ' str(index) ' lines converted.'
Any hints or good references?
CodePudding user response:
import gzip
and replace zipfile.ZipFile
with gzip.open
. Write to the gzip file what you wrote to the temporary file, which is no longer necessary. E.g.
gz = gzip.open('_temp', mode='wb')
gz.write(html)
gz.close()
and get rid of the temp_html
lines and the os.remove()
of it. I recommend the b
in the mode, which was missing for the zip file for some reason.