Because some of the CSV files that I need to read are very large (multiple GB), I am trying to implement a progress bar that indicates the number of bytes read out of the total when reading a CSV file from a URL with pandas.
I am trying to implement something like this:
from tqdm import tqdm
import requests
from sodapy import Socrata
import contextlib
import urllib
import pandas as pd
# Source CSV, plus the size the server advertises for it (0 when the
# Content-Length header is absent).
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
response = requests.get(url, params=None, stream=True)
response.raise_for_status()
total_size = int(response.headers.get('Content-Length', 0))

block_size = 1000  # passed to read_csv as chunksize
df = []  # collects the parsed chunks in arrival order
last_position = 0
cur_position = 1

# The progress bar and the URL stream share one `with`; the stream is
# closed first on exit, then the bar.
with tqdm(
    desc=url, total=total_size, unit='iB', unit_scale=True, unit_divisor=1024
) as bar, contextlib.closing(urllib.request.urlopen(url=url)) as rd:
    # With chunksize set, read_csv returns an iterable TextFileReader
    # instead of a single DataFrame.
    reader = pd.read_csv(rd, chunksize=block_size)
    for chunk in reader:
        df.append(chunk)
        # Open problem: cur_position should be set here to the current byte
        # offset in the stream, but nothing updates it yet.
        bar.update(cur_position - last_position)
        last_position = cur_position
Is there a way to get the file position from the pandas TextFileReader somehow? Perhaps something equivalent to ftell in C for TextFileReader?
CodePudding user response:
Not thoroughly tested, but you can implement a custom class with a read()
method that reads from the requests response line by line and updates the
tqdm bar as it goes:
import requests
import pandas as pd
from tqdm import tqdm
# URL of the CSV file that is streamed with requests and parsed by pandas below.
url = "https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no"
class TqdmReader:
    """Minimal file-like wrapper around a streamed ``requests`` response.

    Exposes a ``read()`` method so the object can be handed to
    ``pandas.read_csv``; every line pulled from the response advances a
    tqdm progress bar by the number of bytes returned.

    Args:
        resp: A response object providing ``headers``, ``url`` and
            ``iter_lines()`` (e.g. ``requests.get(..., stream=True)``).
    """

    def __init__(self, resp):
        total_size = int(resp.headers.get("Content-Length", 0))
        self.resp = resp
        self.bar = tqdm(
            desc=resp.url,
            # A missing Content-Length yields 0; pass None instead so tqdm
            # treats the total as unknown rather than as "zero bytes".
            total=total_size or None,
            unit="iB",
            unit_scale=True,
            unit_divisor=1024,
        )
        self.reader = self.read_from_stream()

    def read_from_stream(self):
        """Yield the response body one newline-terminated line at a time.

        Yields:
            bytes: Each line of the body with ``b"\\n"`` appended.
        """
        for line in self.resp.iter_lines():
            # iter_lines() strips the line terminator; append it back so the
            # CSV parser still sees row boundaries.
            line += b"\n"
            # NOTE(review): the count is approximate when the body uses
            # "\r\n" endings or is transfer-compressed — confirm if exact
            # byte totals matter.
            self.bar.update(len(line))
            yield line

    def read(self, n=0):
        """Return the next line of the body, or ``b""`` at end of stream.

        Args:
            n: Requested size; accepted for file-like compatibility but
                ignored, since exactly one line is returned per call.

        Returns:
            bytes: The next line, or an empty bytes object once exhausted.
        """
        try:
            return next(self.reader)
        except StopIteration:
            # Finalise the bar at EOF; tqdm tolerates repeated close() calls
            # if the consumer reads past the end more than once.
            self.bar.close()
            return b""
# Stream the download so the body is consumed lazily through TqdmReader
# rather than being loaded into memory up front.
with requests.get(url, params=None, stream=True) as resp:
    # Fail fast on 4xx/5xx instead of feeding an error page to the CSV parser.
    resp.raise_for_status()
    df = pd.read_csv(TqdmReader(resp))

# Number of rows parsed.
print(len(df))
Prints:
https://public.tableau.com/views/PPBOpenDataDownloads/UseOfForce-All.csv?:showVizHome=no: 100%|██████████████████████████████████████████████████████████████████████████████| 2.09M/2.09M [00:00<00:00, 2.64MiB/s]
7975