I am using the Google Drive API to get file revisions, and I want to read (not download) the contents of each file revision into a pandas DataFrame. I have been able to get the revision IDs, but reading the contents is the problem. I have tried the following code and I get a googleapiclient.errors.HttpError. If I change get_media to get (which returns alt=json), I get the revision metadata but not the file contents. I would appreciate help here:
import io
import pandas as pd
from google_apis import create_service
from googleapiclient.http import MediaIoBaseDownload
import urllib
import csv
import requests

CLIENT_FILE = 'client_secret.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive']

# Retrieve file revision history
service = create_service(CLIENT_FILE, API_NAME, API_VERSION, SCOPES)

def get_file_revision_history(file_id):
    response = service.revisions().list(
        fileId=file_id,
        fields='*',
        pageSize=1000
    ).execute()
    revisions = response.get('revisions')
    nextPageToken = response.get('nextPageToken')
    while nextPageToken:
        response = service.revisions().list(
            fileId=file_id,
            fields='*',
            pageSize=1000,
            pageToken=nextPageToken
        ).execute()
        revisions.extend(response.get('revisions'))  # accumulate pages instead of overwriting
        nextPageToken = response.get('nextPageToken')
    return revisions

file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'
revision_history = get_file_revision_history(file_id)
print(revision_history)

df = pd.json_normalize(revision_history)
#df.to_csv('revision history ({0}).csv'.format(file_id), index=False)

# Read the file contents of a single revision
revision_history_id = '104'
res = service.revisions().get_media(
    fileId=file_id,
    revisionId=revision_history_id
).execute()
#uri = res.uri
print(res)
CodePudding user response:
Seeing your file ID and the revision ID of '104', I thought the file might be a Google Docs editor file (Document, Spreadsheet, Slides, and so on). In this case, unfortunately, the get_media method cannot be used; the export method is required. But, unfortunately, revisions() has no export method. So, in this case, the export link retrieved from service.revisions().get() is used for downloading the data.
And, your goal is to put the exported values into pandas. From this, I guessed that the file of your file ID might be a Google Spreadsheet. If my understanding is correct, and when this is reflected in a python script, how about the following sample script?
Sample script:
from io import BytesIO

import pandas as pd
import requests

service = build("drive", "v3", credentials=creds)  # Please use your script.
file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'  # This is from your showing script. From your question, I guessed that this might be a Google Spreadsheet.
mimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"  # In this case, the Google Spreadsheet is exported in XLSX format.
sheet = "Sheet1"  # Please set the sheet name you want to retrieve the values from.
revision_history_id = '104'  # Please set the revision ID.

res = service.revisions().get(fileId=file_id, revisionId=revision_history_id, fields="*").execute()
link = res.get("exportLinks").get(mimeType)
if link:
    data = requests.get(link, headers={"Authorization": "Bearer " + creds.token})
    values = pd.read_excel(BytesIO(data.content), sheet_name=sheet)
    print(values)
Here, the access token is retrieved from creds of service = build("drive", "v3", credentials=creds). Please be careful about this. In this case, from io import BytesIO, import pandas as pd, and import requests are also used. When this script is run, the Google Spreadsheet at the given revision ID is exported in XLSX format, and the values of the expected sheet are put into pandas.
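As a side note (not from the answer above, just an illustrative sketch): if the file were a non-Google file, for example an uploaded CSV, revisions().get_media() from your question would work, and the bytes could be read straight into pandas without saving a file. A minimal sketch, assuming the revision is a plain CSV upload and the IDs are hypothetical:

import io

import pandas as pd
from googleapiclient.http import MediaIoBaseDownload

def read_csv_revision(service, file_id, revision_id):
    # get_media streams the raw bytes of a revision; this only works for
    # non-Google (uploaded/binary) files, not Docs, Sheets or Slides.
    request = service.revisions().get_media(fileId=file_id, revisionId=revision_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()
    buffer.seek(0)
    return pd.read_csv(buffer)

# Example (hypothetical IDs):
# df = read_csv_revision(service, 'SOME_UPLOADED_CSV_FILE_ID', '2')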
CodePudding user response:
After giving it some more thought, I finally got it working like this:
import pandas as pd
from google_apis import create_service

CLIENT_FILE = 'client_secret.json'
API_NAME = 'drive'
API_VERSION = 'v3'
SCOPES = ['https://www.googleapis.com/auth/drive']

# Retrieve file revision history
service = create_service(CLIENT_FILE, API_NAME, API_VERSION, SCOPES)

def get_file_revision_history(file_id):
    response = service.revisions().list(
        fileId=file_id,
        fields='*',
        pageSize=1000
    ).execute()
    revisions = response.get('revisions')
    nextPageToken = response.get('nextPageToken')
    while nextPageToken:
        response = service.revisions().list(
            fileId=file_id,
            fields='*',
            pageSize=1000,
            pageToken=nextPageToken
        ).execute()
        revisions.extend(response.get('revisions'))  # accumulate pages instead of overwriting
        nextPageToken = response.get('nextPageToken')
    return revisions

file_id = '1E8Wbd80CbFlFSHYZQkApXMM9EQOz1lQRl4m3rfq-vdY'
revision_history = get_file_revision_history(file_id)

# Pull the CSV export link and modified time of every revision
exportlinks = list(map(lambda x: x["exportLinks"], revision_history))
csv_urls = list(map(lambda y: y["text/csv"], exportlinks))
modified_time = list(map(lambda z: z["modifiedTime"], revision_history))

appended_data = []
for i, j in zip(csv_urls, modified_time):
    # service._http is the authorized HTTP client; request() returns a (headers, content) tuple
    res = service._http.request(i)
    rn_string_data = list(res)[1]
    data = list(map(lambda x: x.split(','), rn_string_data.decode('utf-8').split("\r\n")))
    df = pd.DataFrame(data[1:], columns=['employee_name', 'hourly_rate', 'currency'])
    df['ModifiedTime'] = j
    appended_data.append(df)

appended_data = pd.concat(appended_data)
print(appended_data)
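As an alternative (not part of the original solution, just a sketch): instead of splitting the exported CSV text by hand, the bytes can be handed to pd.read_csv, which also copes with quoted fields that contain commas and uses the CSV's own header row for column names. Assuming the same service and revision_history objects as above:

import io

import pandas as pd

frames = []
for rev in revision_history:
    # Each revision of a Google Sheet exposes per-format export links
    csv_url = rev["exportLinks"]["text/csv"]
    # service._http.request() returns a (headers, content) tuple
    _, content = service._http.request(csv_url)
    frame = pd.read_csv(io.BytesIO(content))  # let pandas parse the CSV, header row included
    frame["ModifiedTime"] = rev["modifiedTime"]
    frames.append(frame)

all_revisions = pd.concat(frames, ignore_index=True)
print(all_revisions)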