I am new to Python. I have this Python code that splits one column of a CSV file into several columns:
import pandas as pd
import re
# Load the input file; cells that are empty in the CSV come back as NaN
# (a float) rather than as strings.
df = pd.read_csv("test.csv")
# Patterns that capture the raw (still quoted) value following each key of a
# "traffic" string such as "{'name': ..., 'medium': ..., 'source': ...}".
# 'name' and 'medium' must be followed by a comma, 'source' by the closing brace.
name_regex = r"(?<=(\'name\': ))[^,]*(?=,)"
medium_regex = r"(?<=(\'medium\': ))[^,]*(?=,)"
source_regex = r"(?<=(\'source\': ))[^,]*(?=})"


def _extract_field(pattern, x):
    """Return the value matched by *pattern* on every line of *x*.

    Args:
        pattern: Regular expression whose whole match is the quoted value.
        x: String holding one dict-like record per line.

    Returns:
        A list with one entry per line of *x*: the matched value with all
        single quotes removed, or None when the line has no match.

    Raises:
        AttributeError: If *x* is not a string (for example the float NaN
            that stands for an empty CSV cell), because it has no ``split``.
    """
    values = []
    for line in x.split("\n"):
        match = re.search(pattern, line)
        # re.search returns None when the key is absent; record a missing
        # value instead of failing on None.group().
        values.append(match.group().replace("'", "") if match else None)
    return values


def get_name(x):
    """Return the 'name' value found on each line of the traffic string *x*."""
    return _extract_field(name_regex, x)


def get_medium(x):
    """Return the 'medium' value found on each line of the traffic string *x*."""
    return _extract_field(medium_regex, x)


def get_source(x):
    """Return the 'source' value found on each line of the traffic string *x*."""
    return _extract_field(source_regex, x)
# Build one list-valued column per extracted field, drop the raw column, then
# explode the lists so that every extracted value ends up on its own row.
# NOTE: each "traffic" cell is handed to the extractor unchanged, so a
# non-string cell (such as the float NaN of an empty cell) fails inside it.
for column, extractor in (
    ("traffic_name", get_name),
    ("traffic_medium", get_medium),
    ("traffic_source", get_source),
):
    df[column] = df["traffic"].apply(extractor)
del df["traffic"]
exploded_columns = ["traffic_name", "traffic_medium", "traffic_source"]
final_df = df.explode(exploded_columns)
print(final_df)
final_df.to_csv("traffic.csv")
This is my CSV data:
,traffic
0,"{'name': '(test1)', 'medium': '(no1)', 'source': '(yes)'}"
1,
2,"{'name': '(test1)', 'medium': 'no2', 'source': 'yes1'}"
3,
4,"{'name': '(test2)', 'medium': 'no3', 'source': 'yes'}"
When I run the Python code, there is an error. Can you please help? Thank you.
~\AppData\Local\Temp\ipykernel_15228\2852501840.py in get_name(x)
12
13 def get_name(x):
---> 14 return [re.search(name_regex, line).group().replace("'", "") for line in x.split("\n")]
15 def get_medium(x):
16 return [re.search(medium_regex, line).group().replace("'", "") for line in x.split("\n")]
AttributeError: 'float' object has no attribute 'split'
CodePudding user response:
The first column in your CSV file is a number (being treated as a float). Numbers cannot be split.
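Additionally, your CSV data looks like it's storing JSON in the second column. Rather than writing regular expressions, just decode the string using the json library. (Note that the values use single quotes, which is Python-literal syntax rather than strict JSON, so `ast.literal_eval` parses them directly, whereas `json.loads` needs the quotes converted to double quotes first.)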
CodePudding user response:
This error happens because pandas reads the empty cells in rows 1 and 3 of the `traffic` column as NaN (there is no value), and NaN is a float. You can remove those float rows using:
# Keep only the rows whose "traffic" cell is a string. The .copy() makes the
# result an independent frame, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['traffic'].apply(lambda x: isinstance(x, str))].copy()
Whole solution:
import pandas as pd
from io import StringIO
import re
# Recreate the CSV from the question in memory so the example needs no file.
CSV = StringIO(
    """
,traffic
0,"{'name': '(test1)', 'medium': '(no1)', 'source': '(yes)'}"
1,
2,"{'name': '(test1)', 'medium': 'no2', 'source': 'yes1'}"
3,
4,"{'name': '(test2)', 'medium': 'no3', 'source': 'yes'}"
"""
)
df = pd.read_csv(filepath_or_buffer=CSV, sep=",")
# Patterns that capture the raw (still quoted) value following each key of a
# "traffic" string such as "{'name': ..., 'medium': ..., 'source': ...}".
# 'name' and 'medium' must be followed by a comma, 'source' by the closing brace.
name_regex = r"(?<=(\'name\': ))[^,]*(?=,)"
medium_regex = r"(?<=(\'medium\': ))[^,]*(?=,)"
source_regex = r"(?<=(\'source\': ))[^,]*(?=})"


def _extract_field(pattern, x):
    """Return the value matched by *pattern* on every line of *x*.

    Args:
        pattern: Regular expression whose whole match is the quoted value.
        x: String holding one dict-like record per line.

    Returns:
        A list with one entry per line of *x*: the matched value with all
        single quotes removed, or None when the line has no match.

    Raises:
        AttributeError: If *x* is not a string (for example the float NaN
            that stands for an empty CSV cell), because it has no ``split``.
    """
    values = []
    for line in x.split("\n"):
        match = re.search(pattern, line)
        # re.search returns None when the key is absent; record a missing
        # value instead of failing on None.group().
        values.append(match.group().replace("'", "") if match else None)
    return values


def get_name(x):
    """Return the 'name' value found on each line of the traffic string *x*."""
    return _extract_field(name_regex, x)


def get_medium(x):
    """Return the 'medium' value found on each line of the traffic string *x*."""
    return _extract_field(medium_regex, x)


def get_source(x):
    """Return the 'source' value found on each line of the traffic string *x*."""
    return _extract_field(source_regex, x)
# Keep only the rows whose "traffic" cell is a string; empty cells are read as
# NaN (a float) and have no .split() for the extractors to call. The .copy()
# makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[df["traffic"].apply(lambda x: isinstance(x, str))].copy()
df["traffic_name"] = df["traffic"].apply(get_name)
df["traffic_medium"] = df["traffic"].apply(get_medium)
df["traffic_source"] = df["traffic"].apply(get_source)
del df["traffic"]
# Each extractor returns a list per row; explode the three columns together so
# every extracted value gets its own row.
final_df = df.explode(["traffic_name", "traffic_medium", "traffic_source"])
print(final_df)
final_df.to_csv("traffic.csv")