'float' object has no attribute 'split' python-CodePudding

I am new to Python. I have this Python code that splits a column from a CSV file into separate columns:

import pandas as pd
import re

df = pd.read_csv("test.csv")

# Patterns that capture the raw (still quoted) value following each key in
# the dict-like text that the getters below receive.
name_regex = r"(?<=(\'name\': ))[^,]*(?=,)"
medium_regex = r"(?<=(\'medium\': ))[^,]*(?=,)"
source_regex = r"(?<=(\'source\': ))[^,]*(?=})"

def get_name(x):
    """Return the 'name' value from each line of x, single quotes removed.

    x must be a str: x.split is called directly, so any other value (for
    example a float) raises AttributeError.
    """
    return [re.search(name_regex, line).group().replace("'", "") for line in x.split("\n")]
def get_medium(x):
    """Return the 'medium' value from each line of x, single quotes removed.

    x must be a str; see get_name for the failure mode on other types.
    """
    return [re.search(medium_regex, line).group().replace("'", "") for line in x.split("\n")]
def get_source(x):
    """Return the 'source' value from each line of x, single quotes removed.

    x must be a str; see get_name for the failure mode on other types.
    """
    return [re.search(source_regex, line).group().replace("'", "") for line in x.split("\n")]

# Build one list-valued column per extracted key, then drop the raw column.
# Each getter is called once per cell of df["traffic"].
df["traffic_name"] = df["traffic"].apply(get_name)
df["traffic_medium"] = df["traffic"].apply(get_medium)
df["traffic_source"] = df["traffic"].apply(get_source)
del df["traffic"]

# Explode the three list columns together so every list element gets its
# own row.
final_df = df.explode(["traffic_name", "traffic_medium", "traffic_source"])

print(final_df)

# Save the exploded frame to traffic.csv.
final_df.to_csv("traffic.csv")

this is my csv data:

,traffic
0,"{'name': '(test1)', 'medium': '(no1)', 'source': '(yes)'}"
1,
2,"{'name': '(test1)', 'medium': 'no2', 'source': 'yes1'}"
3,
4,"{'name': '(test2)', 'medium': 'no3', 'source': 'yes'}"

When I run the Python code, there is an error. Can you please help? Thank you.

~\AppData\Local\Temp\ipykernel_15228\2852501840.py in get_name(x)
     12 
     13 def get_name(x):
---> 14     return [re.search(name_regex, line).group().replace("'", "") for line in x.split("\n")]
     15 def get_medium(x):
     16     return [re.search(medium_regex, line).group().replace("'", "") for line in x.split("\n")]

AttributeError: 'float' object has no attribute 'split'

CodePudding user response：

The empty cells in the traffic column of your CSV file (rows 1 and 3) are read by pandas as NaN, which is a float. Floats have no split method, hence the error.

Additionally, your CSV data looks like it's storing a Python dict literal in the second column (it uses single quotes, so it is not valid JSON). Rather than writing regular expressions, just decode the string using ast.literal_eval from the standard library.

CodePudding user response：

This error happens because pandas treats the empty cells of the traffic column (rows 1 and 3, which have no value and are therefore read as NaN) as floats. You can remove those rows using:

df=df[df['traffic'].apply(lambda x: isinstance(x, str))]

Whole solution:

import pandas as pd
from io import StringIO
import re

# Inline copy of the question's CSV so the example runs without test.csv.
_CSV_TEXT = """
,traffic
0,"{'name': '(test1)', 'medium': '(no1)', 'source': '(yes)'}"
1,
2,"{'name': '(test1)', 'medium': 'no2', 'source': 'yes1'}"
3,
4,"{'name': '(test2)', 'medium': 'no3', 'source': 'yes'}"
"""
CSV = StringIO(_CSV_TEXT)

# read_csv skips the leading blank line by default, so ",traffic" is the header.
df = pd.read_csv(CSV, sep=",")


# Patterns that capture the raw (still quoted) value following each key in
# the dict-like text of a "traffic" cell.
name_regex = r"(?<=(\'name\': ))[^,]*(?=,)"
medium_regex = r"(?<=(\'medium\': ))[^,]*(?=,)"
source_regex = r"(?<=(\'source\': ))[^,]*(?=})"


def _extract_field(pattern, x):
    """Return the value matched by pattern on every line of x.

    Args:
        pattern: Regular expression whose whole match is the quoted value.
        x: Cell content; expected to be a str holding one record per line.

    Returns:
        One entry per line of x: the matched value with single quotes
        removed, or None when the line does not match.  An empty list is
        returned when x is not a str (for example the float NaN that pandas
        uses for empty cells).
    """
    if not isinstance(x, str):
        return []
    values = []
    for line in x.split("\n"):
        match = re.search(pattern, line)
        # Keep a placeholder for non-matching lines so the lists produced
        # for name/medium/source stay the same length for a given cell.
        values.append(match.group().replace("'", "") if match else None)
    return values


def get_name(x):
    """Return the 'name' value from each line of x (see _extract_field)."""
    return _extract_field(name_regex, x)


def get_medium(x):
    """Return the 'medium' value from each line of x (see _extract_field)."""
    return _extract_field(medium_regex, x)


def get_source(x):
    """Return the 'source' value from each line of x (see _extract_field)."""
    return _extract_field(source_regex, x)


# Keep only rows whose "traffic" cell is a string; empty cells are read as
# NaN (a float) and cannot be split.  The explicit copy makes df an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df[df["traffic"].apply(lambda x: isinstance(x, str))].copy()

# Build one list-valued column per extracted key, then drop the raw column.
df["traffic_name"] = df["traffic"].apply(get_name)
df["traffic_medium"] = df["traffic"].apply(get_medium)
df["traffic_source"] = df["traffic"].apply(get_source)
del df["traffic"]

# Explode the three list columns together so every list element gets its
# own row.
final_df = df.explode(["traffic_name", "traffic_medium", "traffic_source"])

print(final_df)

final_df.to_csv("traffic.csv")