I am new to data handling. I need to write a Python program that searches for each record from samplefile1 in samplefile2. I am able to achieve this, but each of the 200 rows in samplefile1 is looped over all 200 rows in samplefile2, and the complete execution takes 180 seconds.
I am looking for a more time-efficient approach so that I can do this task in minimum time.
My actual dataset size is 9 million rows in samplefile1 and 9 million rows in samplefile2.
Here is my code using pandas.
samplefile1 rows:
number='7777777777' subscriber-id="7777777777" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
number='7777777778' subscriber-id="7777777778" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
number='7777777779' subscriber-id="7777777779" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
.........100 rows
samplefile2 rows:
number='7777777777' subscriber-id="7777777777" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
number='7777777778' subscriber-id="7777777778" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
number='7777777769' subscriber-id="7777777779" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip e2u"regexp="!^(.*)$!sip: [email protected];user=phone!"replacement=[]};
........100 rows
import time
import pandas as pd


def timeit(func):
    """
    Decorator for measuring a function's running time.
    """
    def measure_time(*args, **kw):
        start_time = time.time()
        result = func(*args, **kw)
        print("Processing time of %s(): %.2f seconds."
              % (func.__qualname__, time.time() - start_time))
        return result
    return measure_time


@timeit
def func():
    df = pd.read_csv("sample_2.txt", names=["A1"], skiprows=0, sep=';')
    df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
    finaldatafile1 = df.fillna("TrackRow")

    df1 = pd.read_csv("sample_1.txt", names=["A1"], skiprows=0, sep=';')
    df1.drop(df1.filter(regex="Unname"), axis=1, inplace=True)
    finaldatafile2 = df1.fillna("TrackRow")

    indexdf = df.index
    indexdf1 = df1.index

    # outer loop over the strings to be matched (small dataset)
    for i in range(len(indexdf)):
        lookup_value = finaldatafile1.iloc[[i]].to_string()
        # inner loop over the lookup dataset (large dataset)
        for j in range(len(indexdf1)):
            match_value = finaldatafile2.iloc[[j]].to_string()
            if lookup_value == match_value:
                print(f"It's a match on lookup table position {j} and for string {lookup_value}")
            else:
                print("no match found in complete dataset")


if __name__ == "__main__":
    func()
CodePudding user response:
I don't think using pandas is helping here, as you are just comparing whole lines. An alternative approach is to load the first file as a set of lines, then enumerate over the lines in the second file, testing whether each one is in the set. Set membership tests are O(1) on average, so this replaces the nested O(n·m) loop with a single O(n + m) pass and will be much faster:
@timeit
def func():
    # Load the first file once into a set for constant-time membership tests.
    with open('sample_1.txt') as f_sample1:
        data1 = set(f_sample1.read().splitlines())
    with open('sample_2.txt') as f_sample2:
        data2 = f_sample2.read().splitlines()
    # Single pass over the second file, checking each line against the set.
    for index, entry in enumerate(data2):
        if entry in data1:
            print(f"It's a match on lookup table position {index} and for string\n{entry}")
        else:
            print("no match found in complete dataset")