I have two files that I want to compare:
-File 1 contains SHA1 hash values and the file path of executable files. (Each line is a different file.)
-File 2 is a file containing known files and their SHA1 hash value and other data. (Again, each line is a different file.)
I want to compare each line of file 1 against each line of file 2. If there is a match, I want the script to stop comparing and start with the next line of file 1. If there is no match, I want to write that file hash and path from file 1 to file 3.
Also, File 2 is about 30 GB in size, so the code needs to perform the searches as quickly and efficiently as possible.
Here is what I have so far, but I am not getting the desired output.
#!/usr/bin/env python
# Intended purpose: compare the SHA1 hashes listed in sys.argv[1] against the
# known-hash file sys.argv[2] and record unmatched entries in sys.argv[3].
# NOTE(review): indentation was lost in this listing; the comments below assume
# each statement is nested under the nearest preceding with/for/if header.
import sys
# Create (or truncate) the report file and write its heading line.
with open(sys.argv[3], 'w') as f3:
f3.write('Hash Values of Unknown/Modified Executables:' '\n')
# Redundant: the with statement already closes f3 when its block ends.
f3.close()
with open(sys.argv[1], 'r') as f1:
# First 40 characters of the FIRST line of file 1 (40 = length of a SHA1 hex digest).
line_f1 = f1.readline()[0:40]
# NOTE(review): this second readline() returns the NEXT line of file 1, so
# full_line is not the line that line_f1 was sliced from.
full_line = f1.readline()
with open(sys.argv[2], 'r') as f2:
# Skip the first line of file 2 (presumably the CSV column header).
next(f2)
# Characters 1-40 of the following line; index 0 is skipped (presumably the opening quote).
line_f2 = f2.readline()[1:41]
# NOTE(review): line_f2 is a string, so this loop walks its individual characters
# and rebinds line_f1 to each one; no further lines are read from either file.
for line_f1 in line_f2:
if (line_f1 == line_f2):
pass
# Taken whenever the single character differs from the whole slice; each time,
# full_line is appended to the report file.
elif (line_f1 != line_f2):
with open(sys.argv[3], 'a') as f3:
f3.write(full_line)
Here is the sample data set that I am working with (The data has been modified from its original form to allow for some matches and some that do not match.):
File 1
00062169d823ecb98e47918d61be2fe8a84a303b /usr/test1
000864f38d0a8505ee8b5618a29039fe1e644fbe /usr/test2 #NO MATCH
001988b60a8fb6a22d2bfdd442c723b477f840fb /usr/test3 #NO MATCH
001a3627d954ed621910f3c79489a63db36916ba /usr/test4 #NO MATCH
001bf3fa581c7660216b34834d7ac8dc5a75a83f /usr/test5 #NO MATCH
00307660e0c54193b9cf3630d38312acf10b4093 /usr/bin/test6 #NO MATCH
00420df3e26140830e5e298e51d48d3f3c6ffc7e /usr/bin/test7 #NO MATCH
004d06de7875d2c20b0c5b29c3c658bce24b5869 /usr/bin/test8 #NO MATCH
00512e1dd13f0389322de86027b5b5ff74acc706 /usr/bin/test9
005691edd3987a833fcff92be169a71796a4972a /usr/bin/test10 #NO MATCH
0060b09755987013a7bbe5992866f08c39db9e6b /usr/sbin/test11
0063c0d2ab0c4e89dd87a455bb4142e2aa954b62 /usr/sbin/test12
00646760493c4fd9f4644572449c702b2c43116b /usr/sbin/test13
006732aaf4649b21878f4077af807ac34c71dd5d /usr/sbin/test14
006830799e5673d0d8bb988bfc43b6874661f90f /usr/sbin/test15
006cc4f1004af878422bb0775592769f0b7add42 /home/admin/Desktop/test16
009a4244a3e8932c91c1d1eb2057c67a2a15087f /home/admin/Desktop/test17
00a04e033f1191bb1a993777c28b5ba729ceac28 /home/admin/Desktop/test18
00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10 /home/admin/Desktop/test19
00ab49b182b0480f3e7eacff7f1d9505dc0a3a32 /home/admin/Desktop/test20
00b64e72857c63592031d2c682c4563e22a35b98 /home/admin/Documents/test21
00b73f67c6f4a6ffe4f9842f57271ed91225e530 /home/admin/Documents/test22
00ba66e659b9b519401ac69c8cf9f3901055ed42 /home/admin/Documents/test23 #NO MATCH
00c0f1ed488ed13f61f1d77d8b1bf1c3bee1b7e5 /home/admin/Documents/test24 #NO MATCH
00c784a2b0a2818e16ce96eddb676ab17f594e9b /home/admin/Documents/test25
File 2
"SHA-1","MD5","CRC32","FileName","FileSize","ProductCode","OpSystemCode","SpecialCode"
"00b73f67c6f4a6ffe4f9842f57271ed91225e530","344428FA4BA313712E4CA9B16D089AC4","7516A25F",".text._ZNSt14overflow_errorC1ERKSs",33,219181,"362",""
"00b64e72857c63592031d2c682c4563e22a35b98","F46CA74CA3D89E9D3CF8D8E5CD77842D","2F9CC135","__DATA__mod_init_func",772,218747,"362",""
"00ab49b182b0480f3e7eacff7f1d9505dc0a3a32","8ED4B4ED952526D89899E723F3488DE4","7A5407CA","wow64_microsoft-windows-i..timezones.resources_31bf3856ad364e35_10.0.16299.579_de-de_f24979c73226184d.manifest",2520,190718,"362",""
"00aacb0db059d1d8ff0ccafbeeafc7e32ba4fd10","497C460BBA43530494F37DF7DE3A5FF4","46B80AC7","bpa10x.ko",12944,17066,"362",""
"00a04e033f1191bb1a993777c28b5ba729ceac28","7C36BE0D2BF2520D564D36C6F4241B4F","66E07FC3",".text",1130496,223308,"362",""
"009a4244a3e8932c91c1d1eb2057c67a2a15087f","EAEB051BACDB9D67605659E3DF80C48C","74F27585","package_3482_for_kb4462939~31bf3856ad364e35~amd64~~10.0.1.5.cat",10660,204580,"362",""
"006cc4f1004af878422bb0775592769f0b7add42","E7990319759290BB6E0D17D7C685D203","F6A2F49D","ultoa.o",692,220911,"362",""
"006830799e5673d0d8bb988bfc43b6874661f90f","9A872042A9CD96B4FB13901000B91982","97D3B7E8","microsoft-windows-internet-browserppipro-package~31bf3856ad364e35~x86~sl-si~10.0.19041.906.cat",8897,236128,"362",""
"006732aaf4649b21878f4077af807ac34c71dd5d","3491EE38124BF5382D0828C5209C83B5","6CC040F2","Batman_Seventies.POR",90,213814,"362",""
"0000030F6D93EC90BDEA54B08BF7B512B13F55F9","CC6B8BA59F74F251DBCA14962A156C9D","ECEDDFD8",".rodata",173816,220501,"362",""
"000003191A199BFA961C18A6F71FF2ED04D0F9DA","84B2CE4DC226E61470EC240593CCBFF3","CC6201BD",".rdata",5120,221574,"362",""
"0000034F77D9314B1B94DBDA3031BECE1198D067","FE330C56554EF007D38C89764864E365","71C6F991","arm64_49016ecbe73216140477e3b16492e87f_31bf3856ad364e35_10.0.17134.81_none_ae8f44b72b46370a.manifest",705,188511,"362",""
"000003802D91BC41F5C89BB6115903ABC35372AB","F85BA698CA9E66D39BA8E223602E136E","41195B49",".gnu.version",192,226194,"362",""
"00646760493c4fd9f4644572449c702b2c43116b","858DEA54B3CBE4664F6652C37180A8AE","210F55CB","ScBrPls1.A05D7955_E27E_48E7_843F_456A4A59DC3A",456632,226257,"362",""
"0063c0d2ab0c4e89dd87a455bb4142e2aa954b62","0DD50DF49C7E9C01B97038FAE5A077E1","7B608B44",".text",5460480,182069,"362",""
"0060b09755987013a7bbe5992866f08c39db9e6b","849C766653FB4C4C6E9727175FE4974B","16C39D0D",".rela.rodata",23328,263769,"362",""
"00512e1dd13f0389322de86027b5b5ff74acc706","DBAE160A16763542AA8C6A7DBCDB05C5","D6E3587F","GodHitWallAdditiveRight-SkeletalAnimation.bin",20896,198268,"362",""
"00c784a2b0a2818e16ce96eddb676ab17f594e9b","41D0DD202B31F022CDB92802567058A5","7AD24105","redbull.erp",8663417,257490,"362",""
"00000760AB4465B9CE24F569BB49958E36723DEF","8D7D1546A3F803D7B4D5428C756B8E36","50D73BD5","gtru_t26596_s54847_6_p_vzdavani_drPI.ogg",38039,200078,"362",""
"00000767994D0189AED15752A566C8D48E1CBDA0","093941287F5F5A9246395ECC29EDFD27","7ED4CBF1","ServDeps.dll.mui",5120,288137,"362",""
"0000079B4CB3F9C043F48DE6F28ED0A6B9708AC7","7C88ED5E462B23E653455484F4BD2D0A","8F32BA13",".data",216,263765,"362",""
"000007B928F4C211CC8ED3C9707196A7C5BA3AF8","68563E2BFC732E10E885BD2DCF49F2EF","34940E24","microsoft-windows-businessscanning-feature-package~31bf3856ad364e35~amd64~pt-br~6.1.7601.17514.mum",1541,201424,"362",""
"00062169d823ecb98e47918d61be2fe8a84a303b","8DFD90838375D367A11C9064092A6807","409D0B8D",".text",33792,163489,"362",""
CodePudding user response:
A few remarks:
- When using `with open(...)`, you don't need to close the file yourself; it is closed automatically at the end of the block.
- Iterating through the lines of a file `f` is as simple as `for line in f:`.
So you can do the following:
import sys

HEADER = 'Hash Values of Unknown/Modified Executables:\n'


def _first_field(line):
    """Return the first comma-separated field of *line*, unquoted and lowercased."""
    return line.split(',', 1)[0].strip().strip('"').lower()


def write_unknown_hashes(candidates_path, known_path, report_path):
    """Write the lines of *candidates_path* whose hash is absent from *known_path*.

    Each non-blank line of *candidates_path* starts with a hash followed by
    whitespace and a file path.  Each line of *known_path* is a CSV row whose
    first column is a (possibly quoted) hash.  Hashes are compared
    case-insensitively, because hex digests may be written in either case.

    The known-hash file is streamed exactly once, so its size does not affect
    memory use; only the candidate file is held in memory (assumed to be the
    small one -- a list of executables on a single machine).

    Args:
        candidates_path: file with one "<hash> <path>" entry per line.
        known_path: CSV file of known hashes (first column).
        report_path: output file; overwritten.

    Returns:
        The number of unmatched lines written to *report_path*.
    """
    entries = []  # (lowercased hash, original line), in file order
    with open(candidates_path, 'r') as f1:
        for line in f1:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of failing to unpack
            entries.append((fields[0].lower(), line))

    # Hashes not yet seen in the known file; shrinks as matches are found.
    pending = {digest for digest, _ in entries}
    if pending:
        # errors='replace': only the ASCII hash column is inspected, so an
        # undecodable byte elsewhere in a row should not abort the scan.
        with open(known_path, 'r', errors='replace') as f2:
            for line in f2:
                pending.discard(_first_field(line))
                if not pending:
                    break  # everything matched; skip the rest of the file

    unknown = [line for digest, line in entries if digest in pending]
    with open(report_path, 'w') as f3:
        f3.write(HEADER)
        f3.writelines(unknown)
    return len(unknown)


if __name__ == '__main__':
    write_unknown_hashes(sys.argv[1], sys.argv[2], sys.argv[3])