I have a basic script that compares two CSV files and records the changes in a new one (daily-diff.csv):
CSV (import 1):
Address | Price |
---|---|
101 Main Street | £50,000 |
102 Main Street | £100,000 |
CSV (import 2):
Address | Price |
---|---|
101 Main Street | £55,000 |
102 Main Street | £100,000 |
103 Main Street | £200,000 |
# Load both daily snapshots fully into memory as lists of raw text lines.
with open('2022-10-04.csv', 'r') as csv1, open('2022-10-05.csv', 'r') as csv2:
    import1 = csv1.readlines()
    import2 = csv2.readlines()

# Copy every line of the newer snapshot that has no exact textual match in the
# older one. This is a whole-line comparison, so both changed rows and brand
# new rows end up in the output.
with open('daily-diff.csv', 'w') as outFile:
    outFile.writelines(row for row in import2 if row not in import1)
Outcome:
101 Main Street | £55,000 |
103 Main Street | £200,000 |
There are a few glaring issues, but what I'd most appreciate advice on is how to be more specific about what is written to daily-diff.csv:
- Ideally it would omit new or deleted listings to focus on price changes
- It would have an extra column so that the address, old price, and current price are all shown
Desired CSV Outcome:
Address | Old Price | New Price |
---|---|---|
101 Main Street | £50,000 | £55,000 |
CodePudding user response:
This is the sort of thing that I find works best/easiest in a database, for example using Python's built-in sqlite3 module.
cat import_1.csv
Address,Price
101 Main Street,"£50,000"
102 Main Street,"£100,000"
cat import_2.csv
Address,Price
101 Main Street,"£55,000"
102 Main Street,"£100,000"
103 Main Street,"£200,000"
import csv
import sqlite3
# Compare two CSV snapshots by loading them into an in-memory SQLite database
# and letting SQL find the addresses whose price changed.
con_sqlite = sqlite3.connect(":memory:")
cur_sqlite = con_sqlite.cursor()
cur_sqlite.execute("create table import_1 (address varchar, price varchar)")
cur_sqlite.execute("create table import_2 (address varchar, price varchar)")

# The csv module expects files opened with newline="" so that it can handle
# line endings (and newlines embedded in quoted fields) itself.
with open("import_1.csv", newline="") as csv_1:
    reader = csv.reader(csv_1)
    next(reader, None)  # Skip the header row; tolerate a completely empty file.
    # Blank lines come back as empty lists, which cannot fill two placeholders.
    cur_sqlite.executemany(
        "insert into import_1 values(?, ?)",
        (row for row in reader if row),
    )

with open("import_2.csv", newline="") as csv_2:
    reader = csv.reader(csv_2)
    next(reader, None)  # Skip the header row; tolerate a completely empty file.
    cur_sqlite.executemany(
        "insert into import_2 values(?, ?)",
        (row for row in reader if row),
    )

# The inner join keeps only addresses present in both snapshots (new and
# removed listings drop out); the where clause keeps only changed prices.
cur_sqlite.execute(
    "select i1.address, i1.price AS old_price, i2.price AS new_price "
    "from import_1 AS i1 join import_2 AS i2 on i1.address = i2.address "
    "where i1.price != i2.price"
)
diff_list = cur_sqlite.fetchall()

# newline="" prevents doubled line endings in the output on Windows.
with open("daily-diff.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["Address", "Old Price", "New Price"])
    writer.writerows(diff_list)

# The in-memory database is no longer needed once the diff has been written.
con_sqlite.close()
cat daily-diff.csv
Address,Old Price,New Price
101 Main Street,"£50,000","£55,000"
CodePudding user response:
Here is one working example of how you could do this with just standard library tools:
import csv
from pathlib import Path
from typing import Any, TypeAlias
# Shorthand for the dictionaries passed between the functions below:
# string keys mapped to arbitrary values.
AnyDict: TypeAlias = dict[str, Any]
# Header text of the first (key) column. main() uses it to look up the title
# row in the loaded data, so it must match the CSV files' actual header.
FIRST_COLUMN_NAME = "Address"
def load_dict_from_csv(file_path: Path, **reader_kwargs: Any) -> AnyDict:
    """
    Reads CSV file and returns contents as a dictionary.

    Keys are the contents of the first column,
    values are lists of contents in the remaining columns.
    The header row gets no special treatment; it is stored like any other
    row, keyed by the text of its first cell.
    Completely blank lines are skipped. If the same first-column value
    appears in several rows, the last such row wins.

    Any additional keyword arguments are passed on to `csv.reader`.
    """
    # The csv module expects newline="" so that it can process line endings
    # (including newlines inside quoted fields) itself.
    with file_path.open("r", newline="") as file:
        reader = csv.reader(file, **reader_kwargs)
        # A blank line is yielded as an empty list, which has no key column
        # and would otherwise raise an IndexError on row[0].
        return {row[0]: row[1:] for row in reader if row}
def dict_intersect_diff(dict_1: AnyDict, dict_2: AnyDict) -> AnyDict:
    """
    Builds a dictionary describing where two dictionaries disagree.

    Only keys found in both inputs are considered; keys present in just one
    of them are ignored. For every shared key whose values differ, the
    result maps that key to a `(value_from_dict_1, value_from_dict_2)` tuple.
    Keys appear in the order in which they occur in `dict_1`.
    """
    changed: AnyDict = {}
    for key, old_value in dict_1.items():
        if key not in dict_2:
            continue
        new_value = dict_2[key]
        if old_value != new_value:
            changed[key] = (old_value, new_value)
    return changed
def write_dict_diff_to_csv(
    dict_diff: AnyDict,
    title_row: list[str],
    file_path: Path,
    **writer_kwargs: Any,
) -> None:
    """
    Writes a "difference dictionary" to a csv file.

    Creates two columns for each column in the title row (after the first),
    one for the "old" values and one for the "new" values. The first column
    of the title row is kept as-is and holds the dictionary keys.

    `dict_diff` maps each key to a pair `(old_values, new_values)`, where
    both are sequences of cell values. The values are written interleaved
    (old, new, old, new, ...) so that they line up with the header columns.
    If one of the two sequences is shorter, the missing cells are left empty.

    Any additional keyword arguments are passed on to `csv.writer`.
    """
    from itertools import zip_longest

    diff_title = [title_row[0]]
    for col in title_row[1:]:
        diff_title += [f"{col} old", f"{col} new"]
    # The csv module expects newline="" for files it writes to; without it,
    # rows end up separated by blank lines on Windows.
    with file_path.open("w", newline="") as file:
        writer = csv.writer(file, **writer_kwargs)
        writer.writerow(diff_title)
        for key, (values_old, values_new) in dict_diff.items():
            row = [key]
            # Pair each old value with its new counterpart to match the
            # "<col> old", "<col> new" ordering of the header.
            for old, new in zip_longest(values_old, values_new, fillvalue=""):
                row += [old, new]
            writer.writerow(row)
def main() -> None:
    """
    Compares "01.csv" with "02.csv" and writes the changed rows to "diff.csv".

    Both files are loaded into dictionaries keyed by their first column.
    Only keys present in both files whose remaining columns differ are
    written out, with the old and new values side by side.
    """
    file_path_old = Path("01.csv")
    file_path_new = Path("02.csv")
    file_path_diff = Path("diff.csv")
    dict_old = load_dict_from_csv(file_path_old)
    dict_new = load_dict_from_csv(file_path_new)
    # The header row was loaded like any other row, so it is found under the
    # name of the first column; rebuild the full title row from it.
    title_row = [FIRST_COLUMN_NAME] + dict_old[FIRST_COLUMN_NAME]
    dict_diff = dict_intersect_diff(dict_old, dict_new)
    write_dict_diff_to_csv(dict_diff, title_row, file_path_diff)


if __name__ == '__main__':
    main()
Running this with your example CSV input files results in the desired outcome.
NOTE: This assumes that the first column has unique values (i.e. there are no two rows with the same "Address" in one CSV file).
This also works with additional columns (aside from "Price").
It is probably advisable to use specific tools for such purposes like Pandas (as suggested in the comments), if you'll be doing this "at scale". But for this simple use case, something like the script above should be fine.
Hope this helps.