from numpy.core.defchararray import find
# Sample frame: each row pairs a string with the substring to locate in it.
df = pd.DataFrame(
    {
        "string": ["abc", "def", "ghi"],
        "substring": ["bc", "e", "ghi"],
    }
)
I have the following to determine the start position, but I am not sure how to get the end position:
# Element-wise str.find over the two columns: position of each substring
# inside its string. assign() returns a new frame with the extra `start`
# column; the result is not stored here.
haystack = df['string'].values.astype(str)
needle = df['substring'].values.astype(str)
df.assign(start=find(haystack, needle))
expected result:
string substring start end
abc bc 1 2
def e 1 1
ghi ghi 0 2
CodePudding user response:
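Use a list comprehension with the walrus operator `:=` to assign the start position
inside the expression and reuse it when computing the end position.
Collect the (start, end) values in tuples, and finally assign them to the new columns: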
# Build one (start, end) tuple per row. The walrus operator stores the
# result of str.find in `c` so it can be reused for the inclusive end index
# (start + length of the substring - 1) without searching twice.
# NOTE: str.find returns -1 when the substring is absent, in which case the
# computed `end` is not meaningful.
df[['start', 'end']] = [
    (c := a.find(b), c + len(b) - 1)
    for a, b in zip(df['string'], df['substring'])
]
print (df)
string substring start end
0 abc bc 1 2
1 def e 1 1
2 ghi ghi 0 2
Your solution can be extended using the same logic:
from numpy.core.defchararray import find
df = df.assign(
    # Element-wise position of each substring inside its string.
    start=find(
        df['string'].values.astype(str),
        df['substring'].values.astype(str),
    ),
    # `end` is given as a callable so that it is evaluated against the frame
    # that already contains the freshly assigned `start` column.
    # Inclusive end index = start + length of the substring - 1.
    end=lambda x: x['start'] + x['substring'].str.len() - 1,
)
print (df)
string substring start end
0 abc bc 1 2
1 def e 1 1
2 ghi ghi 0 2
If there is no match, `find` returns -1,
so a possible solution is to set those values to NaN
in a next step:
# Sample frame where the first row has no match ("bc" is not in "ab7c").
df = pd.DataFrame(
    {
        "string": ["ab7c", "def", "ghi"],
        "substring": ["bc", "e", "ghi"],
    }
)
print (df)
string substring
0 ab7c bc
1 def e
2 ghi ghi
from numpy.core.defchararray import find
df = df.assign(
    # Element-wise position of each substring inside its string
    # (-1 where the substring does not occur).
    start=find(
        df['string'].values.astype(str),
        df['substring'].values.astype(str),
    ),
    # Callable so it sees the `start` column created just above.
    # Inclusive end index = start + length of the substring - 1.
    end=lambda x: x['start'] + x['substring'].str.len() - 1,
)
# Rows without a match carry start == -1; replace both columns with NaN
# there so the meaningless positions are not mistaken for real ones.
df[['start', 'end']] = df[['start', 'end']].mask(df['start'].eq(-1))
print (df)
string substring start end
0 ab7c bc NaN NaN
1 def e 1.0 1.0
2 ghi ghi 0.0 2.0
CodePudding user response:
Another way of doing it, with more readable code, is as follows:
## this will ensure if not found it will return None
def index_of_substring(main_string, substring):
    """Return the start and inclusive end index of *substring* in *main_string*.

    Args:
        main_string: The string to search in.
        substring: The string to look for.

    Returns:
        A two-element ``pd.Series`` ``[start_index, end_index]`` for the first
        occurrence, where ``end_index = start_index + len(substring) - 1``.
        If *substring* does not occur, the Series is ``[None, None]``.
    """
    try:
        # str.index raises ValueError when the substring is absent.
        start_index = main_string.index(substring)
    except ValueError:
        return pd.Series([None, None])
    end_index = start_index + len(substring) - 1
    return pd.Series([start_index, end_index])
## Then you call the function as follows
# Sample frame: each row pairs a string with the substring to locate in it.
df = pd.DataFrame(
    {
        "string": ["abc", "def", "ghi"],
        "substring": ["bc", "e", "ghi"],
    }
)
# Row-wise apply: every call returns a two-element Series, so the result is
# a two-column frame that is assigned to the new `start` / `end` columns.
positions = df.apply(
    lambda row: index_of_substring(row['string'], row["substring"]),
    axis=1,
)
df[["start", "end"]] = positions
df.head()
string substring start end
0 abc bc 1 2
1 def e 1 1
2 ghi ghi 0 2