import pandas as pd
import numpy as np
import sklearn as preprocessing
# Raw column data keyed by column name. Missing entries are written as the
# string 'nan' (not a real NaN), and header-like labels such as 'country name'
# are mixed in with the data values.
# NOTE: the lists are not all the same length -- 'data source', 'unnamed1' and
# 'unnamed3' have 18 entries, 'unnamed2' has 15 and 'unnamed4' has 17.
country ={'data source':['data','country name','brazil','switzerland','germany','denmark','spain','france','japan','greece','iran','kuwait','morocco','nigeria','qatar','sweden','india','world'],
'unnamed1':['nan','country code','BRA','CHE','DEU','DNK','ESP','FRA','JPN','GRC','IRN','KWT','MAR','NGA','QAT','SWE','IND','WLD'],
'unnamed2':[2016,'population growth',0.817555711,1.077221168,1.193866758,0.834637611,-0.008048086,0.407491036,-0.115284177,-0.687542545,1.1487886,2.924206194,'nan',1.148214693,1.18167997],
'unnamed3':['nan','total population',207652865,8372098,82667685,'nan',46443959,66896109,126994511,10746740,80277428,4052584,35276786,185989640,2569804,9903122,1324171354,7442135578],
'unnamed4':['area(sq.km)',8358140,39516,348900,42262,500210,547557,394560,128900,16287601,'nan',446300,910770,11610,407310,2973190,129733172.7]}
# Build the frame with an explicit 18-row index. Because 'unnamed2' (15) and
# 'unnamed4' (17) are shorter than 18, this call raises
# "ValueError: could not broadcast input array from shape (15,) into shape (18,)".
my_df = pd.DataFrame(country, index=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17], columns=['data source','unnamed1','unnamed2','unnamed3','unnamed4'])
print(my_df)
Running this code raises the following error:
Traceback (most recent call last):
File "c:/Users/se7en/Desktop/AI/skl.py", line 11, in <module>
my_df = pd.DataFrame(country, index=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17], columns=['data source','unnamed1','unnamed2','unnamed3','unnamed4'])
File "C:\Program Files\Python37\lib\site-packages\pandas\core\frame.py", line 614, in __init__
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\construction.py", line 465, in dict_to_mgr
arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\construction.py", line 136, in arrays_to_mgr
arrays, arr_names, axes, consolidate=consolidate
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1776, in create_block_manager_from_arrays
raise construction_error(len(arrays), arrays[0].shape, axes, e)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1773, in create_block_manager_from_arrays
blocks = _form_blocks(arrays, names, axes, consolidate)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1863, in _form_blocks
items_dict["ObjectBlock"], np.object_, consolidate=consolidate
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1903, in _simple_blockify
values, placement = _stack_arrays(tuples, dtype)
File "C:\Program Files\Python37\lib\site-packages\pandas\core\internals\managers.py", line 1959, in _stack_arrays
stacked[i] = arr
ValueError: could not broadcast input array from shape (15,) into shape (18,)
CodePudding user response:
All the lists/arrays in the dictionary must have the same length for the DataFrame constructor to accept the input.
This is not the case with your data:
# Map each column name to the number of entries in its list.
{name: len(values) for name, values in country.items()}
output:
{'data source': 18,
'unnamed1': 18,
'unnamed2': 15,
'unnamed3': 18,
'unnamed4': 17}
Either trim every list to the minimum length, or pad the shorter ones to the maximum length.
Another option to circumvent this might be to use a dictionary of Series, which will do the padding job automatically:
# Wrapping each list in a Series lets pandas align the columns on their index
# and fill the missing tail of the shorter columns with NaN.
df = pd.DataFrame(
    {name: pd.Series(values) for name, values in country.items()}
)
output:
data source unnamed1 unnamed2 unnamed3 unnamed4
0 data nan 2016 nan area(sq.km)
1 country name country code population growth total population 8358140
2 brazil BRA 0.817556 207652865 39516
3 switzerland CHE 1.077221 8372098 348900
4 germany DEU 1.193867 82667685 42262
5 denmark DNK 0.834638 nan 500210
6 spain ESP -0.008048 46443959 547557
7 france FRA 0.407491 66896109 394560
8 japan JPN -0.115284 126994511 128900
9 greece GRC -0.687543 10746740 16287601
10 iran IRN 1.148789 80277428 nan
11 kuwait KWT 2.924206 4052584 446300
12 morocco MAR nan 35276786 910770
13 nigeria NGA 1.148215 185989640 11610
14 qatar QAT 1.18168 2569804 407310
15 sweden SWE NaN 9903122 2973190
16 india IND NaN 1324171354 129733172.7
17 world WLD NaN 7442135578 NaN
NB: You should clarify the output you expect, as your lists appear to mix labels (e.g. 'country name', 'population growth') with data.