I have a fairly long Python process that reprocesses a massive amount of data from one table, extracts the necessary pieces, assigns appropriate values, and inserts the results into another table. The process works great, except that it is incredibly time intensive. I wonder if there is a way to modify it to use an executemany approach. These are snippets of the entire process, but they show where I'm hoping to switch to executemany.
The following code runs through a series of if/then statements to assign the appropriate values to the retrieved information.
# Current crawl begin date
cur.execute('SELECT Crawl_Begin_Date FROM Crawl WHERE Crawl_ID=?', (current_crawl_ID,))
current_crawl_begin_date = cur.fetchone()[0]
# Current crawl end date
cur.execute('SELECT Crawl_End_Date FROM Crawl WHERE Crawl_ID=?', (current_crawl_ID,))
current_crawl_end_date = cur.fetchone()[0]
# URL_Crawl table where Crawl_ID == current crawl
sql = ('SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID '
       'FROM URL_Crawl WHERE Crawl_ID=?')
current_crawl = pd.read_sql_query(sql, con=db, params=(current_crawl_ID,))
# num keeps track of number of rows read through from current_crawl (number of nodes in current crawl)
num = 1
# For every unique url in the current crawl
for row in current_crawl.itertuples():
    # Calculate max date .......................................................
    # Treat missing dates as '0' so the string comparisons below always work
    last_updated = row.Last_Updated if row.Last_Updated not in (None, '') else '0'
    last_published = row.Last_Published if row.Last_Published not in (None, '') else '0'
    date_of_html = row.Date_of_HTML if row.Date_of_HTML not in (None, '') else '0'
    max_date = max(last_updated, last_published, date_of_html)
    # ..........................................................................
    # Set remaining variables from current_crawl dataframe
    url_crawl_id = row.URL_Crawl_ID
    unique_url_id = row.Unique_URL_ID
    # Initialize starting and ending dates/statuses with None
    starting_date = None
    starting_date_status = None
    ending_date = None
    ending_date_status = None
    # URL_Crawl table up until (but not including) current crawl
    # (this query does not depend on row, so it could be hoisted above the loop)
    sql2 = ('SELECT URL_Crawl_ID, Last_Updated, Last_Published, Date_of_HTML, Unique_URL_ID '
            'FROM URL_Crawl WHERE Crawl_ID<?')
    previous_crawls = pd.read_sql_query(sql2, con=db, params=(current_crawl_ID,))
    # If row's unique_url_id exists in previous crawls (not a new node)
    if unique_url_id in previous_crawls['Unique_URL_ID'].tolist():
        # Situation B ...................................................
        # Finding most recent lifetime of existing node
        existing = previous_crawls[previous_crawls['Unique_URL_ID'] == unique_url_id]
        existing_url_crawl_ids = existing.URL_Crawl_ID.tolist()
        sql3 = 'SELECT * FROM Lifetime WHERE URL_Crawl_ID=?'
        # DataFrame.append was removed in pandas 2.0; collect frames and concat once
        frames = [pd.read_sql_query(sql3, con=db, params=(i,)) for i in existing_url_crawl_ids]
        existing_in_lifetime = pd.concat(frames, ignore_index=True)
        most_recent_lifetime = existing_in_lifetime[existing_in_lifetime.Lifetime_ID == existing_in_lifetime.Lifetime_ID.max()]
        # Dates/statuses from most recent lifetime
        most_recent_starting_date = most_recent_lifetime.Starting_Date.iloc[0]
        most_recent_starting_date_status = most_recent_lifetime.Starting_Date_Status.iloc[0]
        most_recent_ending_date = most_recent_lifetime.Ending_Date.iloc[0]
        most_recent_ending_date_status = most_recent_lifetime.Ending_Date_Status.iloc[0]
        most_recent_lifetimeID = most_recent_lifetime.Lifetime_ID.iloc[0]
        if max_date != '0':
            if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                # Situation B.2
                ending_date = max_date
                ending_date_status = "Exact"
                cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                               WHERE Lifetime_ID=?""",
                            (ending_date, ending_date_status, most_recent_lifetimeID))
                starting_date = max_date
                starting_date_status = "Exact"
                ending_date = None
                ending_date_status = None
                cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)""",
                            (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
            elif max_date < current_crawl_begin_date and max_date > most_recent_starting_date:
                # Situation B.3
                ending_date = max_date
                ending_date_status = "Exact"
                cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                               WHERE Lifetime_ID=?""",
                            (ending_date, ending_date_status, most_recent_lifetimeID))
                starting_date = max_date
                starting_date_status = "Exact"
                ending_date = current_crawl_begin_date
                ending_date_status = "Estimated"
                cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)""",
                            (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
            elif max_date == most_recent_starting_date:
                # Situation B.4
                ending_date = current_crawl_begin_date
                ending_date_status = "Estimated"
                cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                               WHERE Lifetime_ID=?""",
                            (ending_date, ending_date_status, most_recent_lifetimeID))
            elif max_date > current_crawl_end_date or max_date < most_recent_starting_date:
                # Situation B.1 - date falls outside the plausible window, treat as missing
                max_date = '0'
        if max_date == '0':
            # Situation B.5
            ending_date = current_crawl_begin_date
            ending_date_status = "Estimated"
            cur.execute("""UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                           WHERE Lifetime_ID=?""",
                        (ending_date, ending_date_status, most_recent_lifetimeID))
    # If row's unique_url_id is a new node (not seen in previous crawls)
    else:
        # Situation A ...................................................
        if max_date != '0':
            if current_crawl_begin_date <= max_date <= current_crawl_end_date:
                # Situation A.2
                starting_date = max_date
                starting_date_status = "Exact"
            elif max_date < current_crawl_begin_date:
                # Situation A.3
                starting_date = max_date
                starting_date_status = "Exact"
                ending_date = current_crawl_begin_date
                ending_date_status = "Estimated"
            elif max_date > current_crawl_end_date:
                # Situation A.1 - date falls outside the plausible window, treat as missing
                max_date = '0'
        if max_date == '0':
            # Situation A.4
            starting_date = current_crawl_end_date
            starting_date_status = "Estimated"
        cur.execute("""INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)""",
                    (starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
Can executemany be used in this capacity? If so, what is the appropriate syntax? I've tried a couple of approaches that haven't worked yet. The database is SQLite and the program is written in Python.
CodePudding user response:
It's hard to give an exact answer without fully understanding your code, but the pattern is straightforward: inside your loop over current_crawl.itertuples(), instead of calling cur.execute for each row, append that row's parameter tuple to a list (one list for UPDATEs, one for INSERTs). After the loop, pass each list to executemany together with the fixed SQL statement; executemany then runs the statement once per parameter tuple.
The sketch below shows how the Situation B.2 and B.3 branches would change; the other branches follow the same pattern.
#...
# These are each a list of tuples/lists,
# i.e. [(param0, ..., paramN), ..., (param0, ..., paramN)]
params_to_update = []
params_to_insert = []
# For every unique url in the current crawl
for row in current_crawl.itertuples():
    #...
    if max_date != '0':
        if current_crawl_begin_date <= max_date <= current_crawl_end_date:
            # Situation B.2
            ending_date = max_date
            ending_date_status = "Exact"
            params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
            starting_date = max_date
            starting_date_status = "Exact"
            ending_date = None
            ending_date_status = None
            params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
        elif max_date < current_crawl_begin_date and max_date > most_recent_starting_date:
            # Situation B.3
            ending_date = max_date
            ending_date_status = "Exact"
            params_to_update.append((ending_date, ending_date_status, most_recent_lifetimeID))
            starting_date = max_date
            starting_date_status = "Exact"
            ending_date = current_crawl_begin_date
            ending_date_status = "Estimated"
            params_to_insert.append((starting_date, ending_date, starting_date_status, ending_date_status, url_crawl_id))
        # ... the remaining situations accumulate params the same way ...

# After the for loop is done.
# Run the UPDATE once for each sequence of params in this list.
UPDATE_SQL = """UPDATE Lifetime SET Ending_Date=?, Ending_Date_Status=?
                WHERE Lifetime_ID=?"""
cur.executemany(UPDATE_SQL, params_to_update)
# Run the INSERT once for each sequence of params in this list.
INSERT_SQL = """INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)"""
cur.executemany(INSERT_SQL, params_to_insert)
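As a side note: if your current code commits after every execute, batching all the writes into a single transaction is usually the biggest win in SQLite; executemany also reuses the prepared statement instead of re-parsing the SQL for every row. Below is a minimal, self-contained sketch of the accumulate-then-executemany pattern with one commit at the end. It uses an in-memory database, a simplified Lifetime table, and made-up parameter tuples purely for illustration - in your real code, params_to_insert would be built inside your crawl loop as shown above.

import sqlite3

# Illustration only: in-memory database and a simplified Lifetime schema.
db = sqlite3.connect(':memory:')
cur = db.cursor()
cur.execute("""CREATE TABLE Lifetime (
                   Lifetime_ID INTEGER PRIMARY KEY,
                   Starting_Date TEXT, Ending_Date TEXT,
                   Starting_Date_Status TEXT, Ending_Date_Status TEXT,
                   URL_Crawl_ID INTEGER)""")

# Made-up parameter tuples; in your code these come from the crawl loop.
params_to_insert = [
    ('2020-01-01', None, 'Exact', None, 1),
    ('2020-02-01', '2020-03-01', 'Exact', 'Estimated', 2),
]

# The connection used as a context manager wraps the batch in one
# transaction: it commits on success and rolls back on an exception.
with db:
    cur.executemany('INSERT INTO Lifetime VALUES (null, ?, ?, ?, ?, ?)',
                    params_to_insert)

print(cur.execute('SELECT COUNT(*) FROM Lifetime').fetchone()[0])  # prints 2

The same approach works for the UPDATE list: because the Lifetime_ID is part of each parameter tuple, executemany runs the same UPDATE statement against a different row each time.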
References:
stackoverflow.com: using-executemany-to-update-entries-in-an-existing-sqlite3-database-using-pyt
docs.python.org: executemany example in the sqlite3 documentation