Home > Software engineering >  How do I append a dataframe to another dataframe by passing it to a function, and returning a datafr
How do I append a dataframe to another dataframe by passing it to a function, and returning a datafr

Time:10-05

 BASE_DIR = "C:\\Users\\blah\\Desktop\\blah\\blah\\"




url = "https://otexa.trade.gov/scripts/tqmon2.exe/catdata"
url2 = "https://otexa.trade.gov/scripts/tqmon2.exe/htsdata"
#Define header information for POST call to url
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
headers["Content-Type"] = "application/json"

cats = ['340','341']
#cats = ['340','341','640','641','347','348','647','648','338','339','638','639','345','645','445','634','635','352','652','349','649','1','31','61']

cats2 = ['6203424511','6204628011','6215100040','6215200000','4202221500']


dictionary_of_different_sites_to_scrape = {'catdata': {
    'url': "https://otexa.trade.gov/scripts/tqmon2.exe/catdata",
    'cat': '340',
    'delimit': 'ssinvert',
    'years': ['2020','2021']
    },
    'htsdata': {
    'url': "https://otexa.trade.gov/scripts/tqmon2.exe/htsdata",
    'cat': '6203424511',
    'delimit': 'ss',
    'years': ['2020', '2021']
}
}



def post_request(param_info): 
    headers = CaseInsensitiveDict()
    headers["Accept"] = "application/json"
    headers["Content-Type"] = "application/json"
    resp = requests.post(url, data=param_info, headers=headers)  # Send the POST request
    #print(resp.text) #Test the function's output
    return resp.text

def url_stripper(text):
    extractor = URLExtract() #Initialize a URL extractor
    urls = extractor.find_urls(text) #Get that URL!
    link = urls[0]
    #print(link)
    return link

def filename_splitter(link):
    if link.find('/'):
        filename = link.rsplit('/', 1)  # Get filename from URL
        #print(filename[1])
    return filename[1]
    
def save_xls_downloads(filename, link):
    # Sends GET request to url for xls download
    r = requests.get(link, allow_redirects=True)
    # Writes the content of the xls to the local drive
    file = open(filename, 'wb')
    file.write(r.content)
    file.close()
    print(filename, "written to folder.")
    return filename

def fix_downloaded_xls(filename, code):
    #This section is required due to Otexa having codec issues in the xls files
    excel = win32.gencache.EnsureDispatch(
        'Excel.Application')  # Initializes an instance of Excel
    # Tell excel to open the file
    wb = excel.Workbooks.Open(BASE_DIR   filename)

    wb.SaveAs("C:\\Users\\blah\\Desktop\\blah\\blah\\"  
              "category - "   code   ".xlsx", FileFormat=51)  # Changes the file from xls to xlsx

    wb.Close()  # Closes the workbook
    excel.Application.Quit()  # Closes the instance of Excel
    return print("Converted", filename, "to category -",code,".xlsx")

def delete_old_xls(filename):
    #Remove old xls file
    remove_old_file = pathlib.Path(BASE_DIR   filename)
    remove_old_file.unlink()
    return print("Deleted", filename)

def clean_new_xlsx(code):
    # Takes new xlsx file, loads as dataframe, skips the first 4 rows, and assigns the new first row as header
    if len(code) > 3:
        rows_to_skip = 5
    else:
        rows_to_skip = 4
    output = pd.read_excel("category - "   code   ".xlsx", skiprows=rows_to_skip, header=0)
    # Tons of NAN values, so replacing with zeroes
    output.fillna(0, inplace=True)
    #print(output.head(2))
    #final_table = final_table.append(output)
    print("Category -", code, ".xlsx has been cleaned and made into dataframe")
    return output

def append_to_report_table(cleaned_dataframe, report_table):
    #print(cleaned_dataframe.head())
    report = report_table.append(cleaned_dataframe, ignore_index=True)
    report_table = report
    #report_table = report_table.rename(columns=report_table.iloc[0])
    #print(report_table.shape)
    #print(report_table.head())
    print("DataFrame appended to report_table.")
    print(report_table.shape)
    return report_table

def save_report_table(site_id, report):
    print(report.shape)
    filename = "C:\\Users\\blah\\Desktop\\blah\\blah\\complete report.xlsx"
    #print(report.info())
    #print(report.head())
    # Drop any rows that the country is equal to 0, which means they were BLANK
    report.drop(report[report['Country'] == 0].index, inplace=True)
    report = report.reset_index()  # Resets the index to get a true count
    report.to_excel(filename)
    print("End")
    return print("Saved complete", site_id, "report.")

for site_id, param_info in dictionary_of_different_sites_to_scrape.items():

    if len(param_info['cat']) > 3:
        hts_report = pd.DataFrame()
        for cat in cats2:
            param_info['cat'] = cat
            text = post_request(param_info)
            link = url_stripper(text)
            filename = filename_splitter(link)
            save_xls = save_xls_downloads(filename, link)
            fix_downloaded_xls(save_xls, param_info['cat'])
            delete_old_xls(filename)
            clean_xlsx = clean_new_xlsx(param_info['cat'])
            append_report = append_to_report_table(clean_xlsx, hts_report)
        save_report_table(site_id, report_table)
    else:
        report_table = pd.DataFrame()
        for cat in cats:
            param_info['cat'] = cat
            post_request(param_info)
            text = post_request(param_info)
            url_stripper(text)
            link = url_stripper(text)
            filename = filename_splitter(link)
            save_xls = save_xls_downloads(filename, link)
            fix_downloaded_xls(save_xls, param_info['cat'])
            delete_old_xls(filename)
            clean_xlsx = clean_new_xlsx(param_info['cat'])
            append_to_report_table(clean_xlsx, report_table)
        save_report_table(site_id, report_table)
    

The code above is meant to scrape otexa for hts code data at various levels. The steps that are meant to occur are:

  1. post request
  2. strip the url
  3. split the filename from the url
  4. save the xls file (these are always corrupted and I have to do this and the next to fix)
  5. fix the xls file
  6. delete the old xls file
  7. clean the new xlsx file and load it into dataframe
  8. append dataframe to master dataframe
  9. save master dataframe to location

Where I begin to run into problems is that after successfully cleaning the file and loading it to a dataframe and returning that dataframe (shape shows data exists in returned object) this object is passed to the next function to have the dataframe appended to another dataframe. Both objects, the master table and the newly cleaned dataframe are passed to the append_to_report_table, but when checking them at the start of the function, both will be empty.

I have given this almost a week's worth of research and effort and do not intend to fail. I am open to all advice, suggestions, solutions, and critiques in the name of doing better work.

Long time reader, infant poster.

CodePudding user response:

The answer to your question is pretty simple.

If you have two datasets, "A" and "B" you can append "A" to "B" or "B" to "A" by using this code:

dataset = [A, B]
result = pd.concat(dataset)

To see if the dataset increased in size, you can simply do:

result.shape

This solution does require the Pandas Library and Numpy Library

  • Related