Home > Net >  Python 3: Performing "left join" of two lists of lists without using pandas
Python 3: Performing "left join" of two lists of lists without using pandas

Time:01-01

Due to a restriction on being able to use pandas (not allowed) I am trying to do a left join operation between two csv files. I'm struggling. Here is an example:

import csv

def read_csv(path):
    file = open(path, "r")
    content_list = []    
    for line in file.readlines():
        record = line.split(",")
        for item in range(len(record)):
            record[item] = record[item].replace("\n","")    
        content_list.append(tuple(record))
    return content_list 

lookup_list = read_csv("lookup.csv")
data_list = read_csv("data.csv")

print("list with id and name:")
print(lookup_list)
print("list with id, age, weight:")
print(data_list)

result =list()
data_dict = {x[0]: x for x in data_list[1:]}

for left in lookup_list[1:]:
    if left[0] in data_dict:
        result.append(data_dict.get(left[0])   left[1:])

print("Result of merge:")
print(result)

list with id and name:

[('id', 'name'), ('123', 'Robin'), ('221', 'Larry'), ('331', 'Wilson'), ('412', 'Jack')]

list with id, age, weight:

[('id', 'age', 'weight'), ('123', '47', '320'), ('221', '47', '190'), ('331', '25', '225'), ('412', '21', '180'), ('110', '14', '150')]

Result of merge:

[('123', '47', '320', 'Robin'), ('221', '47', '190', 'Larry'), ('331', '25', '225', 'Wilson'), ('412', '21', '180', 'Jack')]

As the lookup_list does not have an entry for id 110, it is not included in the results. I need it to be included in the results with an empty value for 'name'. This is where I'm struggling.

This was so much easier with pandas but our automation engineers are restricting us to only libraries/modules included with the standard python distribution.

Thanks in advance for your help.

CodePudding user response:

This solution does as I described, and reads the lists into dictionaries. You can then write a new CSV file with the combined results.

import csv
from pprint import pprint

def read_csv(path):
    file = open(path, "r")
    contents = {}
    header = []
    for line in file.readlines():
        record = line.strip().split(",")
        if not header:
            header = record
        else:
            contents[record[0]] = {a:b for a,b in zip(header,record)}
    return contents

lookup_list = read_csv("xxx.csv")
data_list = read_csv("yyy.csv")

print("list with id and name:")
pprint(lookup_list)
print("list with id, age, weight:")
pprint(data_list)

for k,v in data_list.items():
    if k not in lookup_list:
        lookup_list[k] = {'name':''}
    lookup_list[k].update(v)

print("Result of merge:")
pprint(lookup_list)

Output:

list with id and name:
{'123': {'id': '123', 'name': 'Robin'},
 '221': {'id': '221', 'name': 'Larry'},
 '331': {'id': '331', 'name': 'Wilson'},
 '412': {'id': '412', 'name': 'Jack'}}
list with id, age, weight:
{'110': {'age': '14', 'id': '110', 'weight': '150'},
 '123': {'age': '47', 'id': '123', 'weight': '320'},
 '221': {'age': '47', 'id': '221', 'weight': '190'},
 '331': {'age': '25', 'id': '331', 'weight': '255'},
 '412': {'age': '21', 'id': '412', 'weight': '180'}}
Result of merge:
{'110': {'age': '14', 'id': '110', 'name': '', 'weight': '150'},
 '123': {'age': '47', 'id': '123', 'name': 'Robin', 'weight': '320'},
 '221': {'age': '47', 'id': '221', 'name': 'Larry', 'weight': '190'},
 '331': {'age': '25', 'id': '331', 'name': 'Wilson', 'weight': '255'},
 '412': {'age': '21', 'id': '412', 'name': 'Jack', 'weight': '180'}}

FOLLOWUP

Just to further the discussion, here's how it would be done in sqlite. I suppose each individual needs to evaluate whether this is better or not.

import csv
from pprint import pprint
import sqlite3

db = sqlite3.connect(":memory:")
db.execute( 'CREATE TABLE lookup (id int, name text);' )
db.execute( 'CREATE TABLE data (id int, age int, weight int);' )

def read_csv(db, table, path):
    cur = db.cursor()
    header = []
    for line in open(path).readlines():
        if not header:
            header = line.rstrip()
            continue
        record = line.strip().split(",")
        sql = f"INSERT INTO {table} ({header}) VALUES ("
        sql  = ','.join(['?']*len(record))   ");"
        cur.execute(sql, record)

lookup_list = read_csv(db, "lookup", "xxx.csv")
data_list = read_csv(db, "data", "yyy.csv")

cur = db.cursor()
for row in cur.execute(
    "SELECT data.id,lookup.name,data.age,data.weight FROM data LEFT JOIN lookup ON lookup.id = data.id;"):
    print(row)

Output:

(123, 'Robin', 47, 320)
(221, 'Larry', 47, 190)
(331, 'Wilson', 25, 255)
(412, 'Jack', 21, 180)
(110, None, 14, 150)

CodePudding user response:

sqlite3 is included with the standard Python distribution.

You can just create an in-memory database, put the csv contents into tables, and do an actual left join.

See this answer for creating an sqlite database from csv Importing a CSV file into a sqlite3 database table using Python

create your tables using the method shown in that answer. Let's say you have called your tables t_lookup and t_data, and your database connection is called conn1.

cursor = conn1.cursor()
cursor.execute('''
SELECT t1.*, t2.name
FROM
t_data t1
LEFT JOIN
t_lookup t2
ON t1.id = t2.id;''')

left_result = cursor.fetchall()

for row in left_result:
    print(row)

conn1.close()
  • Related