I have a fairly complex sequence of functions calling apis and having the result set be appended to a dataframe - the thing is when I print the dataframe during each loop of append, I see new values but at the end when the loop breaks, I only see what value for final_df ? Any thoughts as to why?
df = pd.DataFrame(columns = ['repo', 'number', 'title', 'branch', 'merged_at', 'created_at', 'authored_by', 'merged_by', 'from_version', 'to_version'] )
def get_prs(repo,pr_number):
response = requests.request("GET", pgv.github_pr_url str(repo) '/pulls/' str(pr_number), headers=pgv.headers)
response = response.json()
return response
def get_commits(repo,from_version,to_version):
response = requests.request("GET", pgv.github_commits_url str(repo) '/compare/' str(from_version) '...' str(to_version) , headers=pgv.headers)
response = response.json()
# print(len(response['commits']))
# print(response['commits'])
for i in range(0,len(response['commits'])):
# print(response['commits'][i])
# x = re.match(r"\AMerge pull request #(?P<number>\d ) from/(?P<branch>(. )\s*$)", response['commits'][i].get('commit').get('message'))
x = re.search("\AMerge pull request #(?P<number>\d ) from/(?P<branch>.*)", response['commits'][i].get('commit').get('message'))
# print(x)
if x is None:
pass
else:
# return re.search("(\d )",x.group(0)).group(0), response['commits'][i].get('branches_url')
return x.group('number'), x.group('branch')
# print(x.group('branch'))
#query GitHub to get all commits between from_version and to_version.
def return_deploy_events():
final_object = []
response = requests.request('POST',pgv.url, params = {'api_key' : pgv.key}, json = pgv.query_params)
response = response.json()
if "jobs" in response:
time.sleep(5)
else:
for i in range(0,len(response['query_result']['data']['rows'])):
# print(response['query_result']['data']['rows'])
# get_prs(response['query_result']['data']['rows'][i].get('REPO'),get_commits(response['query_result']['data']['rows'][i].get('REPO'),response['query_result']['data']['rows'][i].get('FROM_VERSION'), response['query_result']['data']['rows'][i].get('TO_VERSION'))).get('merged_at')
try:
repo = response['query_result']['data']['rows'][i].get('REPO')
from_version = response['query_result']['data']['rows'][i].get('FROM_VERSION')
to_version = response['query_result']['data']['rows'][i].get('TO_VERSION')
# print(get_prs(repo,get_commits(repo,from_version, to_version)))
pull_requests = get_prs(repo,get_commits(repo,from_version, to_version)[0])
##pack into all one return
final_df = df.append({
'repo':repo,
'title': pull_requests.get('title'),
'branch': get_commits(repo,from_version, to_version)[1],
'created_at': pull_requests.get('created_at'),
'merged_at': pull_requests.get('merged_at'),
'authored_by': pull_requests.get('user').get('login'),
'merged_by': pull_requests.get('merged_by').get('login'),
'number': get_commits(repo,from_version, to_version)[0],
'from_version': from_version,
'to_version': to_version,}, ignore_index = True)
# print(get_commits(repo,from_version, to_version))
**HERE, WHEN UNCOMMENTED, PRINTS ALL RECORDS I WANT APPENDED **
# print(final_df.head(10))
except Exception:
pass
# 'title':, 'branch',
# 'merged_at', 'created_at', 'authored_by', 'merged_by',
# 'from_version': response['query_result']['data']['rows'][i].get('FROM_VERSION'), 'to_version':response['query_result']['data']['rows'][i].get('TO_VERSION')},
# ignore_index = True)
**BELOW IS WHERE IT PRINTS ONLY 1 RECORD **
print(final_df)
# final_df = json.loads(final_df.to_json(orient = 'records'))
# gec.json_to_s3(final_df, glob_common_vars.s3_resource,glob_common_vars.s3_bucket_name, 'test/test.json.gzip')
return_deploy_events()
CodePudding user response:
I think the problem is, you are assigning each rows to the same variable. So the last row will be printed at the last. So try to append each rows to result list.
def return_deploy_events():
final_object = []
result = []
response = requests.request('POST',pgv.url, params = {'api_key' : pgv.key}, json = pgv.query_params)
response = response.json()
if "jobs" in response:
time.sleep(5)
else:
for i in range(0,len(response['query_result']['data']['rows'])):
# print(response['query_result']['data']['rows'])
# get_prs(response['query_result']['data']['rows'][i].get('REPO'),get_commits(response['query_result']['data']['rows'][i].get('REPO'),response['query_result']['data']['rows'][i].get('FROM_VERSION'), response['query_result']['data']['rows'][i].get('TO_VERSION'))).get('merged_at')
try:
repo = response['query_result']['data']['rows'][i].get('REPO')
from_version = response['query_result']['data']['rows'][i].get('FROM_VERSION')
to_version = response['query_result']['data']['rows'][i].get('TO_VERSION')
# print(get_prs(repo,get_commits(repo,from_version, to_version)))
pull_requests = get_prs(repo,get_commits(repo,from_version, to_version)[0])
##pack into all one return
final_df = df.append({
'repo':repo,
'title': pull_requests.get('title'),
'branch': get_commits(repo,from_version, to_version)[1],
'created_at': pull_requests.get('created_at'),
'merged_at': pull_requests.get('merged_at'),
'authored_by': pull_requests.get('user').get('login'),
'merged_by': pull_requests.get('merged_by').get('login'),
'number': get_commits(repo,from_version, to_version)[0],
'from_version': from_version,
'to_version': to_version,}, ignore_index = True)
# print(get_commits(repo,from_version, to_version))
**HERE, WHEN UNCOMMENTED, PRINTS ALL RECORDS I WANT APPENDED **
# print(final_df.head(10))
result.append(final_df) # append the current row to result
except Exception:
pass
**BELOW IS WHERE IT PRINTS ONLY 1 RECORD **
print(result) # print the final result
I just added two lines of code, but I hope it works.