Merge two lists in reverse chronological order using regular expression python-CodePudding

I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost; the only thing I can do to merge them without errors so far is concatenate them using the '+' operator. These are the two .txt files I am trying to merge.

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

import re
import sys

def read_tweets(file):
    """Parse a tweet file into a list of record dictionaries.

    Each line is expected to look like
    ``tweeter "tweet text" YEAR MONTH DAY HH:MM:SS``. A leading ``@`` on the
    tweeter name is accepted but not required. Lines that do not match this
    layout are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts, in file order, with the keys ``tweeter``, ``tweet``,
        ``year``, ``month``, ``day`` (date parts converted to ``int``) and
        ``time`` (the ``HH:MM:SS`` text as found in the line).
    """
    # The "+" quantifiers are essential: without them every group matches a
    # single character followed by a literal space, so no real line matches.
    # Compiled once here rather than once per line.
    tweet_pattern = re.compile(
        r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)'
    )

    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = tweet_pattern.search(line)
            if match:
                records_list.append({
                    'tweeter': match.group(1),
                    'tweet': match.group(2),
                    'year': int(match.group(3)),
                    'month': int(match.group(4)),
                    'day': int(match.group(5)),
                    'time': match.group(6)
                })
    return records_list

def merge_tweets(list1, list2):
    """Merge two lists of tweet records, newest tweet first.

    Args:
        list1: Records with the keys ``year``, ``month``, ``day`` (ints) and
            ``time`` (an ``HH:MM:SS`` string).
        list2: A second list of records with the same keys.

    Returns:
        A new list holding every record from both inputs in reverse
        chronological order. The inputs are not modified. Records with
        identical timestamps keep their relative order (``list1`` entries
        before ``list2`` entries).
    """
    def chronological_key(record):
        # Compare the clock time numerically rather than as text, so an
        # unpadded hour such as '9:05:00' still sorts before '13:00:00'.
        hour, minute, second = (int(part) for part in record['time'].split(':'))
        return (record['year'], record['month'], record['day'],
                hour, minute, second)

    # Sorting the concatenation (instead of a two-pointer merge) stays
    # correct even when the input lists are not already ordered.
    return sorted(list1 + list2, key=chronological_key, reverse=True)

def write_tweets(records_list, file):
    """Write tweet records to a text file, one tweet per line.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``,
            ``year``, ``month``, ``day`` and ``time``.
        file: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        # Lines are produced lazily and written in the order given.
        out.writelines(
            f'@{record["tweeter"]} "{record["tweet"]}" {record["year"]} {record["month"]} {record["day"]} {record["time"]}\n'
            for record in records_list
        )

def main():
    """Command-line driver: read two tweet files, merge them, write the result.

    Expects exactly three command-line arguments (two input paths and one
    output path). With any other argument count it prints a usage message and
    exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = cli_args

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)

    # Report which input held more tweets, or that they tied.
    count1 = len(records_list1)
    count2 = len(records_list2)
    if count1 == count2:
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')
    elif count1 > count2:
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    else:
        print(f'{file2} contained the most tweets with {len(records_list2)}.')

    print('\nMerging files...')
    merged_records = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(merged_records, output_file)
    print('File written.')

CodePudding user response：

I added some helper functions. A few comments:

read_files() does the merging, so I eliminated merge_tweets()
datetime is helpful when handling timestamps, and formatted timestamps are written to file (you can reformat them via record["timestamp"] in write_tweets() and write them out in your own format)
these functions pass lists stored in memory, so be careful if you have many tweets, in that case use iterators, which are memory efficient. I passed lists because your functions do so.

import re
import sys
from datetime import datetime


def read_files(file1, file2):
    """Read two tweet files into one combined list of records.

    Each line is parsed with ``read_tweet``. Lines it cannot parse (it
    returns ``None`` for them) are skipped, so the combined list holds only
    real records and can be sorted by ``timestamp`` safely.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)``. ``records_list`` holds the
        parsed records of both files, first file first, in file order.
        ``file_lengths`` is a two-item list with the number of tweets parsed
        from ``file1`` and ``file2`` respectively.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand so an empty file simply yields 0.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                if record is None:
                    # Blank or malformed line: not a tweet, do not count it.
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths


def read_tweet(line: str):
    """Parse one line of a tweet file into a record dictionary.

    The line is expected to contain
    ``tweeter "tweet text" YEAR MONTH DAY HOUR:MINUTE:SECOND``.

    Args:
        line: A single line of text from a tweet file.

    Returns:
        A dict with the keys ``tweeter`` (str), ``tweet`` (str) and
        ``timestamp`` (``datetime``), or ``None`` when the line does not
        match the expected layout.

    Raises:
        ValueError: If the line matches but its numbers do not form a valid
            date or time (raised by the ``datetime`` constructor).
    """
    # The "+" quantifiers are essential: without them each group matches one
    # character followed by a literal space, so no real tweet line matches.
    match = re.search(
        r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line
    )
    if match is None:
        return None

    year, month, day, hour, minute, second = (
        int(match.group(index)) for index in range(3, 9)
    )
    return {
        'tweeter': match.group(1),
        'tweet': match.group(2),
        'timestamp': datetime(
            year=year,
            month=month,
            day=day,
            hour=hour,
            minute=minute,
            second=second,
        ),
    }


def sort_tweets(records_list):
    """Return the records ordered from newest to oldest.

    Args:
        records_list: Iterable of dicts that each carry a ``timestamp`` key
            with mutually comparable values.

    Returns:
        A new list sorted by ``timestamp`` in descending order; the input is
        left untouched.
    """
    def by_timestamp(record):
        return record["timestamp"]

    # Copy first so the caller's collection is not reordered in place.
    newest_first = list(records_list)
    newest_first.sort(key=by_timestamp, reverse=True)
    return newest_first


def write_tweets(records_list, file):
    """Write tweet records to a text file, one tweet per line.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``
            and ``timestamp``; the timestamp is written using its default
            string form.
        file: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        # Lines are produced lazily and written in the order given.
        out.writelines(
            f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n'
            for record in records_list
        )


def main():
    """Command-line driver: read two tweet files, sort them, write the result.

    Expects exactly three command-line arguments (two input paths and one
    output path). With any other argument count it prints a usage message and
    exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = cli_args

    print('Reading files...')
    # read_files returns the combined records plus the per-file counts.
    combined_records, per_file_counts = read_files(file1, file2)

    records_list_values = sort_tweets(combined_records)
    records_list1_count, records_list2_count = per_file_counts

    # Report which input held more tweets, or that they tied.
    if records_list1_count == records_list2_count:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')
    elif records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    else:
        print(f'{file2} contained the most tweets with {records_list2_count}.')

    print('\nWriting file...')
    write_tweets(records_list_values, output_file)
    print('File written.')


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Output:

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47