note: I'm not a programmer by any means, but I've been asking questions and doing tutorials to try to understand how to get a solution for my issues, outside of therapy.
Anyway, I have a script that downloads .csv files and then imports them into a Postgres database. I noticed that the .csv's were having a lot of "float" data types and I needed those to be integers so I found the following snippet and was using it. I was learning how to use pandas and I'm getting an error on the
df.iloc[index, c] = int(x) line.
it's showing "ValueError: cannot convert float NaN to integer"
I ended up commenting out a few lines in that snippet and it works but it's turning columns into float and adding .0.
in my head, I'm thinking I'm not identifying the right columns but I'm not sure how to tell.
If I comment out everything under the duplicated line and above the return df line.. it runs without error but the data isn't accurate. It's showing float data in fields that should just be integers.
I'm thinking that I need to be identifying every column that needs to be looked at as a float and make it an integer, but not sure how to do that in a script so it looks at anything downloaded and fixes that. Because I'm having the script log into a site and download .csv files and then print them and put them into postgres.
print("Printing csv present at " file_path)
df = pd.read_csv(file_path, index_col=None, header=0)
df = df.loc[:,~df.columns.duplicated()]
for index, row in df.iterrows():
for c, x in enumerate(row):
if isinstance(x, float):
df.iloc[index, c] = int(x)
return df
I'm learning as I go so excuse if things do not look "textbook" but the more I practice the better I get.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from subprocess import CREATE_NO_WINDOW
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from datetime import timedelta, datetime
import time
import json
import sys
import csv
import random
import requests
from bs4 import BeautifulSoup
import logging, traceback
from os.path import expanduser
import os,glob
import pandas as pd
# All credentials are housed in the user environment under the designated variable.
cf_user = os.environ.get('CF_USER') # Windows Variable
cf_password = os.environ.get('CF_PASS') # Windows Variable
home = expanduser("~")
cf_url = 'website.url.com'
#directory where cf files are downloaded
directory = home "/Downloads/"
def getDriver():
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")
driver = webdriver.Chrome(executable_path=r'C:\chromedriver_win\chromedriver.exe', options=chrome_options)
return driver
driver = getDriver()
def waitAndGetElement(delay, condition):
elem = WebDriverWait(driver, delay).until(condition)
return elem
def bulkDownload(url, file_format):
driver.get(url)
downloadBtn = waitAndGetElement(600000, EC.presence_of_element_located((By.XPATH, '//a[text()=" Download Now"]')))
time.sleep(5)
downloadBtn.click()
current_url = url = driver.current_url
export_num = current_url.split("/")[-1]
file_path = file_format.format(export_num)
while not os.path.exists(file_path):
time.sleep(2)
print("Printing csv present at " file_path) # This takes the downloaded file and prints it to the csvs folder
df = pd.read_csv(file_path, index_col=None, header=0)
df = df.loc[:,~df.columns.duplicated()]
for index, row in df.iterrows():
for c, x in enumerate(row):
if isinstance(x, float):
df.iloc[index, c] = int(x)
return df
driver.get("https://" cf_url "/users/sign_in")
email_input = waitAndGetElement(300, EC.presence_of_element_located((By.ID, 'user_email')))
email_input.send_keys(cf_user)
pwd_input = waitAndGetElement(300, EC.presence_of_element_located((By.ID, 'user_password')))
pwd_input.send_keys(cf_password)
submit_btn = waitAndGetElement(300, EC.presence_of_element_located((By.XPATH, '//input[@name="commit"]')))
submit_btn.click()
ADDRESS_FILE_NAME_FORMAT = home "/Downloads/address_export_{}.csv"
ORDER_FILE_NAME_FORMAT = home "/Downloads/customer_export_{}.csv"
SIGNATURE_FILE_NAME_FORMAT = home "/Downloads/signature_export_{}.csv"
PRODUCT_FILE_NAME_FORMAT = home "/Downloads/product_export_{}.csv"
df = bulkDownload("https://" cf_url "/path_to_sign/url/site_admin/products/export?model=product", PRODUCT_FILE_NAME_FORMAT)
df.to_csv('csvs/products.csv', index = False)
df = bulkDownload("https://" cf_url "/path_to_sign/url/site_admin/signatures/export?filterrific[sorted_by]=signed_at_desc&model=signature", SIGNATURE_FILE_NAME_FORMAT)
df.to_csv('csvs/signatures.csv', index = False)
df = bulkDownload("https://" cf_url "/path_to_sign/url/site_admin/orders/export?filterrific[sorted_by]=created_at_desc&model=customer", ORDER_FILE_NAME_FORMAT)
df.to_csv('csvs/customers.csv', index = False)
df = bulkDownload("https://" cf_url "/path_to_sign/url/site_admin/addresses/export?model=address", ADDRESS_FILE_NAME_FORMAT)
df.to_csv('csvs/address.csv', index = False)
driver.quit()
CodePudding user response:
You need to ensure that the float does have a value otherwise it cannot be coverted to an integer. Carry out an extra check with if math.isnan(x): and add the import math statement at top of script.
for index, row in df.iterrows():
for c, x in enumerate(row):
if isinstance(x, float):
if math.isnan(x):
df.iloc[index, c] = 0 # zero or whatever value when NaN is identified
else:
df.iloc[index, c] = int(x)
return df