import pathlib

import pandas as pd
import requests
from requests.structures import CaseInsensitiveDict
from urlextract import URLExtract
import win32com.client as win32

BASE_DIR = "C:\\Users\\blah\\Desktop\\blah\\blah\\"
# Endpoint URLs (also stored per-site in the dictionary below, which is what
# post_request actually uses; these module-level copies are kept for reference)
url = "https://otexa.trade.gov/scripts/tqmon2.exe/catdata"
url2 = "https://otexa.trade.gov/scripts/tqmon2.exe/htsdata"
cats = ['340', '341']  # 3-digit OTEXA category codes (short test list)
#cats = ['340','341','640','641','347','348','647','648','338','339','638','639','345','645','445','634','635','352','652','349','649','1','31','61']
cats2 = ['6203424511', '6204628011', '6215100040', '6215200000', '4202221500']  # 10-digit HTS codes
dictionary_of_different_sites_to_scrape = {
    'catdata': {
        'url': "https://otexa.trade.gov/scripts/tqmon2.exe/catdata",
        'cat': '340',
        'delimit': 'ssinvert',
        'years': ['2020', '2021']
    },
    'htsdata': {
        'url': "https://otexa.trade.gov/scripts/tqmon2.exe/htsdata",
        'cat': '6203424511',
        'delimit': 'ss',
        'years': ['2020', '2021']
    }
}
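# Each entry above bundles everything post_request needs for one site: the
# endpoint URL, a starting category code, the 'delimit' option the CGI script
# expects, and the years to pull. A hypothetical third site would follow the
# same shape, e.g.:
#   'newsite': {'url': "...", 'cat': '...', 'delimit': '...', 'years': ['2020', '2021']}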
def post_request(param_info):
    # Define header information for the POST call
    headers = CaseInsensitiveDict()
    headers["Accept"] = "application/json"
    headers["Content-Type"] = "application/json"
    # Post to the URL carried in param_info rather than the module-level url,
    # so each site in the dictionary hits its own endpoint
    resp = requests.post(param_info['url'], data=param_info, headers=headers)
    return resp.text
def url_stripper(text):
    extractor = URLExtract()  # Initialize a URL extractor
    urls = extractor.find_urls(text)  # Pull every URL out of the response body
    link = urls[0]  # The first URL found is the download link
    return link
def filename_splitter(link):
    # str.find() returns -1 (truthy) when '/' is absent, so test membership instead
    if '/' in link:
        filename = link.rsplit('/', 1)  # Split the filename off the end of the URL
        return filename[1]
    return link  # No slash at all: the link is already a bare filename
def save_xls_downloads(filename, link):
    # Send a GET request to the link for the xls download
    r = requests.get(link, allow_redirects=True)
    # Write the content of the xls under BASE_DIR, where the later steps look for it
    with open(BASE_DIR + filename, 'wb') as file:
        file.write(r.content)
    print(filename, "written to folder.")
    return filename
def fix_downloaded_xls(filename, code):
    # This step is required because Otexa's xls files have codec issues
    excel = win32.gencache.EnsureDispatch('Excel.Application')  # Start an Excel instance
    wb = excel.Workbooks.Open(BASE_DIR + filename)  # Tell Excel to open the file
    # Re-save as xlsx (FileFormat=51 is xlOpenXMLWorkbook)
    wb.SaveAs(BASE_DIR + "category - " + code + ".xlsx", FileFormat=51)
    wb.Close()  # Close the workbook
    excel.Application.Quit()  # Close the Excel instance
    print("Converted", filename, "to category -", code, ".xlsx")
def delete_old_xls(filename):
    # Remove the old xls file
    remove_old_file = pathlib.Path(BASE_DIR + filename)
    remove_old_file.unlink()
    print("Deleted", filename)
def clean_new_xlsx(code):
    # Load the new xlsx as a dataframe, skipping the title rows so the
    # first remaining row becomes the header
    if len(code) > 3:
        rows_to_skip = 5  # HTS downloads carry one extra title row
    else:
        rows_to_skip = 4
    output = pd.read_excel(BASE_DIR + "category - " + code + ".xlsx",
                           skiprows=rows_to_skip, header=0)
    # Tons of NaN values, so replace them with zeroes
    output.fillna(0, inplace=True)
    print("Category -", code, ".xlsx has been cleaned and made into a dataframe")
    return output
def append_to_report_table(cleaned_dataframe, report_table):
    report = report_table.append(cleaned_dataframe, ignore_index=True)
    report_table = report
    print("DataFrame appended to report_table.")
    print(report_table.shape)
    return report_table
def save_report_table(site_id, report):
    print(report.shape)
    filename = BASE_DIR + "complete report.xlsx"
    # Drop any rows where Country equals 0, which means they were blank
    report.drop(report[report['Country'] == 0].index, inplace=True)
    report = report.reset_index()  # Reset the index to get a true count
    report.to_excel(filename)
    print("End")
    print("Saved complete", site_id, "report.")
for site_id, param_info in dictionary_of_different_sites_to_scrape.items():
    if len(param_info['cat']) > 3:  # 10-digit HTS codes take this branch
        hts_report = pd.DataFrame()
        for cat in cats2:
            param_info['cat'] = cat
            text = post_request(param_info)
            link = url_stripper(text)
            filename = filename_splitter(link)
            save_xls = save_xls_downloads(filename, link)
            fix_downloaded_xls(save_xls, param_info['cat'])
            delete_old_xls(filename)
            clean_xlsx = clean_new_xlsx(param_info['cat'])
            append_report = append_to_report_table(clean_xlsx, hts_report)
        save_report_table(site_id, hts_report)
    else:  # 3-digit category codes take this branch
        report_table = pd.DataFrame()
        for cat in cats:
            param_info['cat'] = cat
            text = post_request(param_info)
            link = url_stripper(text)
            filename = filename_splitter(link)
            save_xls = save_xls_downloads(filename, link)
            fix_downloaded_xls(save_xls, param_info['cat'])
            delete_old_xls(filename)
            clean_xlsx = clean_new_xlsx(param_info['cat'])
            append_to_report_table(clean_xlsx, report_table)
        save_report_table(site_id, report_table)
The code above is meant to scrape OTEXA for HTS code data at various levels. The steps that are meant to occur are:
- post request
- strip the url
- split the filename from the url
- save the xls file (these downloads always arrive corrupted, so this step and the next are needed to fix them)
- fix the xls file
- delete the old xls file
- clean the new xlsx file and load it into dataframe
- append dataframe to master dataframe
- save master dataframe to location
Where I begin to run into problems: after a file is successfully cleaned, loaded into a dataframe, and returned (its shape shows the returned object contains data), that object is passed to the next function to be appended to the master dataframe. Both objects, the master table and the newly cleaned dataframe, are passed to append_to_report_table, but when I check them at the start of the function, both are empty.
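For reference, here is a stripped-down sketch of the hand-off with hypothetical data (assuming pandas < 2.0, where DataFrame.append still exists) that reproduces the empty master table:
import pandas as pd

master = pd.DataFrame()
cleaned = pd.DataFrame({'Country': ['Mexico'], 'Value': [100]})  # hypothetical stand-in for a cleaned sheet

def append_to_report_table(cleaned_dataframe, report_table):
    report = report_table.append(cleaned_dataframe, ignore_index=True)
    report_table = report  # rebinds only the local name
    return report_table

result = append_to_report_table(cleaned, master)
print(result.shape)  # (1, 2) -- the returned frame has the data
print(master.shape)  # (0, 0) -- but master itself is unchanged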
I have given this almost a week's worth of research and effort and do not intend to fail. I am open to all advice, suggestions, solutions, and critiques in the name of doing better work.
Long-time reader, infant poster.
CodePudding user response:
The answer to your question is pretty simple.
If you have two DataFrames, A and B, you can append A to B or B to A with this code:
dataset = [A, B]
result = pd.concat(dataset)
To see if the dataset increased in size, you can simply do:
result.shape
This solution requires the pandas library (NumPy comes along automatically as a pandas dependency).
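Applied to the function in the question, the fix might look like the sketch below. The key point is that pd.concat (like DataFrame.append) returns a brand-new frame and never modifies its inputs, so the caller has to assign the result back:
import pandas as pd

def append_to_report_table(cleaned_dataframe, report_table):
    # pd.concat returns a new DataFrame; it does not modify report_table in place
    report_table = pd.concat([report_table, cleaned_dataframe], ignore_index=True)
    print("DataFrame appended to report_table.")
    print(report_table.shape)
    return report_table

# In the driver loop the returned frame must be captured, otherwise the
# master table stays empty no matter how many times the function runs:
# report_table = append_to_report_table(clean_xlsx, report_table)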