In the code below I get the fileID of a csv file on Google Drive. Now, I want to store the file content directly in a pandas frame instead of downloading the csv file and afterwards extracting the data (as shown in the code).
import io
import os.path
import pandas as pd
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
# OAuth scopes requested when authorizing against Google Drive.
# The cached token.json is loaded with these scopes, so if you modify them,
# delete the file token.json to force a fresh authorization.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
# Login to Google Drive
def login():
    """Authenticate against Google Drive and return a Drive v3 service object.

    Cached credentials are read from ``token.json`` when that file exists.
    If they are missing or invalid they are either refreshed (when expired
    and a refresh token is available) or obtained through the interactive
    installed-app flow using ``./src/credentials.json``. The resulting
    credentials are written back to ``token.json`` for the next run.

    Returns:
        The object produced by ``build('drive', 'v3', credentials=...)``.
    """
    creds = None
    # token.json stores the user's access and refresh tokens and is created
    # automatically when the authorization flow completes for the first time.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            # Refreshing needs no user interaction, so no login prompt here.
            creds.refresh(Request())
        else:
            # Only ask the user to log in when the interactive flow really runs.
            print("Login to your Google Drive account which holds/shares the file database")
            flow = InstalledAppFlow.from_client_secrets_file(
                './src/credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run.
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)
# Download files from Google Drive
def downloadFile(file_name):
    """Download the Drive file called ``file_name`` to ``./data/<file_name>.csv``.

    The file is looked up by exact name. If several files match, the last
    one in the listing is used. If none matches, a warning is printed and
    nothing is written.

    Args:
        file_name: Name of the file on Google Drive; also used (with a
            ``.csv`` suffix) as the local file name under ``./data``.

    Returns:
        None.
    """
    # Authenticate
    service = login()

    # Escape backslashes and single quotes so a name containing them cannot
    # break out of the quoted value in the Drive search query.
    escaped_name = file_name.replace("\\", "\\\\").replace("'", "\\'")
    response = service.files().list(
        q=f"name='{escaped_name}'",
        spaces='drive',
        fields='nextPageToken, files(id, name)',
    ).execute()

    # Track the match explicitly instead of probing locals().
    file_id = None
    for file in response.get('files', []):
        file_id = file.get('id')

    if file_id is None:
        print(f"\033[1;31m Warning: Can't download >> {file_name} << because it is missing!!!\033[0;0m")
        return

    request = service.files().get_media(fileId=file_id)
    print(f"Downloading {file_name}.csv")
    # The context manager guarantees the local file is flushed and closed.
    with io.FileIO(f"./data/{file_name}.csv", "wb") as fh:
        downloader = MediaIoBaseDownload(fh, request)
        # Creating the downloader transfers nothing; data only arrives when
        # next_chunk() is called, so loop until the download reports done.
        done = False
        while not done:
            _, done = downloader.next_chunk()
    return
# Example invocation: look up the Drive file named "NameOfFile" and download it.
downloadFile("NameOfFile")
Is there any way to achieve this? Thanks a lot for your help
CodePudding user response:
From your statement "The problem is to be able to do that I need the file's URL but I'm not able to retrieve it.", I thought that your file might be a Google Spreadsheet. When the file is a Google Spreadsheet, `webContentLink` is not included in the retrieved metadata.
If my understanding of your situation is correct, how about the following modification?
Modified script:
From:
file_id = file.get('id')
# !!! Here, I would like to get the URL of the file and download it to a pandas data frame !!!
file_url = file.get("webContentLink")
To:
file_id = file.get('id')
file_url = file.get("webContentLink")
if not file_url:
    # No webContentLink: export the file as CSV through the Drive API and
    # collect the bytes in memory rather than writing them to disk.
    csv_buffer = io.BytesIO()
    export_request = service.files().export_media(fileId=file_id, mimeType='text/csv')
    downloader = MediaIoBaseDownload(csv_buffer, export_request)
    finished = False
    while not finished:
        progress, finished = downloader.next_chunk()
        print("Download %d%%" % int(progress.progress() * 100))
    # Rewind so pandas reads the buffer from the beginning.
    csv_buffer.seek(0)
    df = pd.read_csv(csv_buffer)
    print(df)
- In this modification, the Google Spreadsheet is exported as the CSV data using Drive API, and the exported data is put to the dataframe.
- In this modification, please add `import io` and `from googleapiclient.http import MediaIoBaseDownload`.
Note:
- In this case, the Google Spreadsheet is exported as CSV data using the Drive API, so please include the scope `https://www.googleapis.com/auth/drive.readonly` or `https://www.googleapis.com/auth/drive`. When your scope is only `https://www.googleapis.com/auth/drive.metadata.readonly`, an error occurs. Please be careful about this.
Reference:
Added:
When the file is the CSV data, please modify as follows.
file_id = file.get('id')
# Fetch the raw CSV bytes into memory rather than writing them to disk.
csv_buffer = io.BytesIO()
media_request = service.files().get_media(fileId=file_id)
downloader = MediaIoBaseDownload(csv_buffer, media_request)
finished = False
while not finished:
    progress, finished = downloader.next_chunk()
    print("Download %d%%" % int(progress.progress() * 100))
# Rewind so pandas reads the buffer from the beginning.
csv_buffer.seek(0)
df = pd.read_csv(csv_buffer)
print(df)