Home > OS >  Printing Several CSV files to install into Postgres - getting error converting Float to Integer
Printing Several CSV files to install into Postgres - getting error converting Float to Integer

Time:11-04

note: I'm not a programmer by any means, but I've been asking questions and doing tutorials to try to understand how to get a solution for my issues, outside of therapy.

Anyway, I have a script that downloads .csv files and then imports them into a Postgres database. I noticed that the .csv's were having a lot of "float" data types and I needed those to be integers so I found the following snippet and was using it. I was learning how to use pandas and I'm getting an error on the

df.iloc[index, c] = int(x) line.

it's showing "ValueError: cannot convert float NaN to integer"

I ended up commenting out a few lines in that snippet and it works but it's turning columns into float and adding .0.

in my head, I'm thinking I'm not identifying the right columns but I'm not sure how to tell.

If I comment out everything under the duplicated line and above the return df line.. it runs without error but the data isn't accurate. It's showing float data in fields that should just be integers.

I'm thinking that I need to be identifying every column that needs to be looked at as a float and make it an integer, but not sure how to do that in a script so it looks at anything downloaded and fixes that. Because I'm having the script log into a site and download .csv files and then print them and put them into postgres.

    print("Printing csv present at "   file_path) 
    df = pd.read_csv(file_path, index_col=None, header=0)
    df = df.loc[:,~df.columns.duplicated()]
    for index, row in df.iterrows():
        for c, x in enumerate(row):
            if isinstance(x, float):
                df.iloc[index, c] = int(x)
    return df

I'm learning as I go so excuse if things do not look "textbook" but the more I practice the better I get.

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from subprocess import CREATE_NO_WINDOW
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from datetime import timedelta, datetime
import time
import json
import sys
import csv
import random
import requests
from bs4 import BeautifulSoup
import logging, traceback
from os.path import expanduser
import os,glob
import pandas as pd

# All credentials are housed in the user environment under the designated variable. 

cf_user = os.environ.get('CF_USER') # Windows Variable
cf_password = os.environ.get('CF_PASS') # Windows Variable

home = expanduser("~")
cf_url = 'website.url.com'

#directory where cf files are downloaded
directory = home   "/Downloads/"

def getDriver():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--start-maximized")
    driver = webdriver.Chrome(executable_path=r'C:\chromedriver_win\chromedriver.exe', options=chrome_options)
    return driver

driver = getDriver()

def waitAndGetElement(delay, condition):
    elem = WebDriverWait(driver, delay).until(condition)
    return elem

def bulkDownload(url, file_format):
    driver.get(url)
    downloadBtn = waitAndGetElement(600000, EC.presence_of_element_located((By.XPATH, '//a[text()=" Download Now"]')))
    time.sleep(5)
    downloadBtn.click()
    current_url = url = driver.current_url
    export_num = current_url.split("/")[-1]

    file_path = file_format.format(export_num)
    while not os.path.exists(file_path):
        time.sleep(2)
    
    print("Printing csv present at "   file_path) # This takes the downloaded file and prints it to the csvs folder
    df = pd.read_csv(file_path, index_col=None, header=0)
    df = df.loc[:,~df.columns.duplicated()]
    for index, row in df.iterrows():
        for c, x in enumerate(row):
            if isinstance(x, float):
                df.iloc[index, c] = int(x)
    return df

driver.get("https://"   cf_url   "/users/sign_in")
email_input = waitAndGetElement(300, EC.presence_of_element_located((By.ID, 'user_email')))
email_input.send_keys(cf_user)
pwd_input = waitAndGetElement(300, EC.presence_of_element_located((By.ID, 'user_password')))
pwd_input.send_keys(cf_password)

submit_btn = waitAndGetElement(300, EC.presence_of_element_located((By.XPATH, '//input[@name="commit"]')))
submit_btn.click()

ADDRESS_FILE_NAME_FORMAT = home   "/Downloads/address_export_{}.csv"
ORDER_FILE_NAME_FORMAT = home   "/Downloads/customer_export_{}.csv"
SIGNATURE_FILE_NAME_FORMAT = home   "/Downloads/signature_export_{}.csv"
PRODUCT_FILE_NAME_FORMAT = home   "/Downloads/product_export_{}.csv"

df = bulkDownload("https://"   cf_url   "/path_to_sign/url/site_admin/products/export?model=product", PRODUCT_FILE_NAME_FORMAT)
df.to_csv('csvs/products.csv', index = False)

df = bulkDownload("https://"   cf_url   "/path_to_sign/url/site_admin/signatures/export?filterrific[sorted_by]=signed_at_desc&model=signature", SIGNATURE_FILE_NAME_FORMAT)
df.to_csv('csvs/signatures.csv', index = False)

df = bulkDownload("https://"   cf_url   "/path_to_sign/url/site_admin/orders/export?filterrific[sorted_by]=created_at_desc&model=customer", ORDER_FILE_NAME_FORMAT)
df.to_csv('csvs/customers.csv', index = False)

df = bulkDownload("https://"   cf_url   "/path_to_sign/url/site_admin/addresses/export?model=address", ADDRESS_FILE_NAME_FORMAT)
df.to_csv('csvs/address.csv', index = False)



driver.quit()

CodePudding user response:

You need to ensure that the float does have a value otherwise it cannot be coverted to an integer. Carry out an extra check with if math.isnan(x): and add the import math statement at top of script.

for index, row in df.iterrows():
    for c, x in enumerate(row):
        if isinstance(x, float):
            if math.isnan(x):
                df.iloc[index, c] = 0 # zero or whatever value when NaN is identified
            else:
                df.iloc[index, c] = int(x)
return df
  • Related