In a project, I receive JSON that I need to read into a pandas DataFrame.
The format looks like the one below (with more columns and rows):
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
I'm able to split the strings, but my types are not what I want. Is there an automated way to convert the columns in `u`?
from io import StringIO
import pandas as pd
# Sample payload: one JSON column whose name holds the ';'-separated column
# names and whose values hold the ';'-separated row fields.
TESTDATA = StringIO("""
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
""")
df = pd.read_json(TESTDATA)
df.head(10)  # only displays something in a notebook/REPL
# The single column's name carries the real column names.
vnames = df.columns[0].split(';')
# set_axis() already returns a new frame; the `inplace` argument was removed
# in pandas 2.0, so passing it raises TypeError there.
u = (df[df.columns[0]].str.split(';', expand=True)
     .set_axis(vnames, axis=1)).convert_dtypes()
print(u.head(10))
# info() prints its report itself and returns None, so it is not wrapped in
# print().
u.info()
I want the dtypes to be int64, datetime64, float64, and str.
CodePudding user response:
You could do the following:
from io import StringIO
import pandas as pd
import numpy as np
# Sample payload: one JSON column whose name holds the ';'-separated column
# names and whose values hold the ';'-separated row fields.
TESTDATA = StringIO("""
{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}
""")
df = pd.read_json(TESTDATA)
df.head(10)  # only displays something in a notebook/REPL
vnames = df.columns[0].split(';')
# set_axis() already returns a new frame; the `inplace` argument was removed
# in pandas 2.0, so it is not passed here.
u = (df[df.columns[0]].str.split(';', expand=True)
     .set_axis(vnames, axis=1))
# Turn the literal 'NA' markers into real missing values so they serialise
# to JSON null below.
u = u.apply(lambda x: x.str.strip()).replace('NA', np.nan)
# Round-trip through JSON so read_json() re-infers the column types.
# The text is wrapped in StringIO because passing a literal JSON string to
# read_json() is deprecated.
u = pd.read_json(StringIO(u.to_json()))
# read_json() only auto-parses dates for date-like column names, so the
# day-first dates in column 'b' are converted explicitly.
u['b'] = pd.to_datetime(u['b'], format='%d/%m/%Y')
u = u.convert_dtypes()
print(u.head(10))
# info() prints its report itself and returns None, so it is not wrapped in
# print().
u.info()
CodePudding user response:
Try explicitly typecasting the string values before creating the DataFrame, like in this example:
import json
import pandas as pd
# Sample payload: the single key holds the ';'-separated column names and
# each value holds one ';'-separated row.
s_src = '''{ "a;b;c;d":{
"1":"100;14/09/2020;0.5;XK3",
"2":"NA;17/09/2020;0.95;NA",
"3":"102;NA;NA;KZ2"}}'''
s = json.loads(s_src)


def parse_date(text):
    """Parse a day-first ``DD/MM/YYYY`` string into a pandas Timestamp."""
    # An explicit format stops ambiguous dates such as 03/09/2020 from being
    # read month-first.
    return pd.to_datetime(text, format='%d/%m/%Y')


# per-column type conversion, in the same order as the ';'-separated fields
typeconv = [int, parse_date, float, str]
for k1, subd in s.items():
    cols = k1.split(';')
    rows = []
    for v in subd.values():
        # 'NA' marks a missing value; every other field goes through the
        # conversion function registered for its column position
        rows.append([
            None if r == 'NA' else cvt(r)
            for cvt, r in zip(typeconv, v.split(';'))
        ])
    # A missing value in an integer column makes pandas fall back to
    # float64; convert_dtypes() switches to the nullable dtypes so column
    # 'a' stays integer.
    df = pd.DataFrame(rows, columns=cols).convert_dtypes()