I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost; the only thing I can do to merge them without errors so far is concatenate them using the `+` operator. These are the two .txt files I am trying to merge.
file 1:
poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00
file 2:
ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47
My code so far is shown below.
My question is: how do I implement the merge_tweets() function to merge the two .txt files in reverse chronological order using regular expressions?
import re
import sys
def read_tweets(file):
    """Parse the tweet records stored in a text file.

    Each line is expected to look like
    ``@name "tweet text" YYYY MM DD HH:MM:SS``; the leading ``@`` is
    optional. Lines that do not match this layout are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts in file order with the keys ``tweeter`` and ``tweet``
        (strings), ``year``, ``month`` and ``day`` (ints) and ``time`` (the
        ``HH:MM:SS`` text exactly as it appears in the file).
    """
    # Each field needs a ``+`` quantifier: without it every group matches a
    # single character followed by a literal space and no real line matches.
    # The tweet text uses a greedy ``.*`` so quotes inside a tweet are kept.
    pattern = re.compile(r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)')
    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match is None:
                continue
            tweeter, tweet, year, month, day, time = match.groups()
            records_list.append({
                'tweeter': tweeter,
                'tweet': tweet,
                'year': int(year),
                'month': int(month),
                'day': int(day),
                'time': time,
            })
    return records_list
def _tweet_sort_key(record):
    """Return a tuple that orders a tweet record chronologically."""
    # Compare the clock fields numerically so "9:05:00" and "09:05:00"
    # order the same way regardless of zero padding.
    clock = tuple(int(part) for part in record['time'].split(':'))
    return (record['year'], record['month'], record['day']) + clock


def merge_tweets(list1, list2):
    """Merge two lists of tweet records, newest first.

    Args:
        list1: Records with ``year``, ``month``, ``day`` (ints) and ``time``
            (``HH:MM:SS`` string) keys.
        list2: Records of the same shape.

    Returns:
        A new list holding every record from both inputs in reverse
        chronological order. The input lists are not modified.
    """
    return sorted(list1 + list2, key=_tweet_sort_key, reverse=True)
def write_tweets(records_list, file):
    """Write tweet records to a text file, one per line.

    Each line has the form ``@tweeter "tweet" year month day time``.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``,
            ``year``, ``month``, ``day`` and ``time``.
        file: Path of the output file; it is opened in ``'w'`` mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(
                f'@{record["tweeter"]} "{record["tweet"]}" {record["year"]} {record["month"]} {record["day"]} {record["time"]}\n')
def main():
    """Read two tweet files, report which is larger, merge them and write the result.

    Expects exactly three command-line arguments: the two input files and
    the output file. Prints a usage message and exits with status 1
    otherwise.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)
    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)
    if len(records_list1) > len(records_list2):
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    elif len(records_list2) > len(records_list1):
        print(f'{file2} contained the most tweets with {len(records_list2)}.')
    else:
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')

    print('\nMerging files...')
    records_list = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(records_list, output_file)
    print('File written.')
CodePudding user response:
I added some helper functions. A few comments:
- `read_files()` does the merging, so I eliminated `merge_tweets()`.
- `datetime` is helpful when handling timestamps, and formatted timestamps are written to the file (you can reformat `record["timestamp"]` inside `write_tweets()` and write it in your own format).
- These functions pass lists stored in memory, so be careful if you have many tweets; in that case use iterators, which are memory efficient. I passed lists because your functions do so.
import re
import sys
from datetime import datetime
def read_files(file1, file2):
    """Read both tweet files into one combined list of records.

    Args:
        file1: Path of the first input file.
        file2: Path of the second input file.

    Returns:
        A ``(records_list, file_lengths)`` tuple. ``records_list`` holds the
        parsed records of ``file1`` followed by those of ``file2``;
        ``file_lengths`` holds the number of records parsed from each file,
        in the same order.
    """
    records_list, file_lengths = [], []
    for path in (file1, file2):
        count = 0
        with open(path, 'r') as f:
            for line in f:
                record = read_tweet(line)
                # read_tweet() returns None for lines it cannot parse (for
                # example blank lines); keeping those would break the later
                # sort on record["timestamp"].
                if record is None:
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths
def read_tweet(line: str):
    """Parse one ``name "text" YYYY MM DD HH:MM:SS`` line into a record.

    Args:
        line: A single line of a tweet file.

    Returns:
        A dict with the keys ``tweeter``, ``tweet`` and ``timestamp`` (a
        ``datetime``), or ``None`` when the line does not match the layout.

    Raises:
        ValueError: If the line matches but its numbers do not form a valid
            date/time (raised by ``datetime``).
    """
    # Each field needs a ``+`` quantifier so that it can span more than one
    # character. The greedy ``.*`` keeps quotes that appear inside a tweet.
    match = re.search(
        r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if match is None:
        return None
    tweeter, tweet, *stamp = match.groups()
    year, month, day, hour, minute, second = (int(part) for part in stamp)
    return {
        'tweeter': tweeter,
        'tweet': tweet,
        'timestamp': datetime(
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute,
            second=second,
        ),
    }
def sort_tweets(records_list):
    """Return the records sorted newest first.

    Args:
        records_list: Iterable of dicts that each carry a comparable
            ``timestamp`` value.

    Returns:
        A new list ordered by ``timestamp`` in descending order; the input
        is left unchanged.
    """
    return sorted(records_list, key=lambda x: x["timestamp"], reverse=True)
def write_tweets(records_list, file):
    """Write tweet records to a text file, one per line.

    Each line has the form ``@tweeter "tweet" timestamp``, where the
    timestamp is rendered with its default string conversion.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``
            and ``timestamp``.
        file: Path of the output file; it is opened in ``'w'`` mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n')
def main():
    """Read two tweet files, report which is larger, sort and write the result.

    Expects exactly three command-line arguments: the two input files and
    the output file. Prints a usage message and exits with status 1
    otherwise.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)
    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    # read_files() returns the combined records plus the per-file counts.
    records_list, (records_list1_count, records_list2_count) = read_files(file1, file2)
    records_list_values = sort_tweets(records_list)

    if records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    elif records_list2_count > records_list1_count:
        print(f'{file2} contained the most tweets with {records_list2_count}.')
    else:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')

    print('\nWriting file...')
    write_tweets(records_list_values, output_file)
    print('File written.')
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Output:
@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47