I'm parsing invoice PDF files to JSON, and they have a field called 'invoice_date'. In one invoice the date can be given as '03 Apr 2022' and in another as '2022-09-23', for instance. I want the date to always be in 'YYYY-MM-DD' format, as in the second example.
I'm trying to do this check:
# NOTE(review): datetime.strptime() raises ValueError on a format mismatch instead of
# returning a falsy value, so wrapping it in `not bool(...)` cannot act as a format test.
if isdate(invoice_json_formatted['invoice_date']) and not bool(datetime.strptime(invoice_json_formatted['invoice_date'], "%Y-%m-%d")):
# NOTE(review): the converted string is not assigned to anything, and "%Y-%d-%m"
# writes the day before the month.
datetime.strptime(invoice_json_formatted['invoice_date'], "%d %b %Y").strftime("%Y-%d-%m")
and it works for the 'YYYY-MM-DD' format but not for the 'DD Mon YYYY' one (e.g. '03 Apr 2022'). The problem is that my code crashes in the second part of my if statement, because datetime.strptime(invoice_json_formatted['invoice_date'], "%Y-%m-%d")
raises an error if the provided value does not match the format I am checking against.
Here is the full code responsible for parsing:
from dateutil.parser import parse
from datetime import datetime
import fitz
import json
import re
class InvoiceEncoder(json.JSONEncoder):
    """JSON encoder that additionally accepts ``bytes`` and ``bytearray`` values."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        ``bytes`` and ``bytearray`` values are decoded as ISO-8859-1, which
        maps every possible byte to a character and therefore never fails.
        Any other type is handed to the base class, which raises
        ``TypeError``.
        """
        if isinstance(obj, (bytes, bytearray)):
            return obj.decode('ISO-8859-1')
        return super().default(obj)
def isint(obj: object) -> bool:
    """Return True when ``int(obj)`` succeeds, False otherwise.

    Only the exceptions ``int()`` raises for unconvertible input are
    swallowed (wrong type, malformed text, infinite float); anything else,
    such as ``KeyboardInterrupt``, propagates.
    """
    try:
        int(obj)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
def isdate(value: str) -> bool:
    """Return True when dateutil can parse ``value`` as a date (non-fuzzy).

    ``parse`` signals failure in three ways, all reported here as False:
    ``ValueError`` (including its ``ParserError`` subclass) for text that is
    not a date, ``OverflowError`` for out-of-range numbers, and ``TypeError``
    for input that is not a string or stream.
    """
    try:
        parse(value, fuzzy=False)
    except (ValueError, OverflowError, TypeError):
        return False
    return True
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` with ``n`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    for start in range(0, len(lst), n):
        yield lst[start:start + n]
# def chrange(start: str, stop: str) -> list[str]:
# return [chr(n) for n in range(ord(start), ord(stop) + 1)]
# Read page 0 of a hard-coded PDF with fitz, dump the raw page dictionary to
# 'data.json', and return the text of every span as a flat list of strings.
# NOTE(review): indentation was lost in this copy of the function; the nesting
# described in the comments below is the most likely reading — confirm.
def parse_invoide_pdf() -> list:
# How can we parse a PDF with the Fitz library?
# First we need to read the file (INV-000399)
# NOTE(review): the document handle is never closed inside this function.
document = fitz.open("Downloads/INV-000399.pdf")
# Then get the page to parse
page = document.load_page(0)
# Then get the text data from that page
# as a Python dictionary
text = page.get_text('dict')
# Side effect: the raw dictionary is written to 'data.json' in the current
# working directory; the rest of the function keeps using `text`, not the file.
with open('data.json', 'w', encoding='utf-8') as f:
json.dump(text, f, indent=4, cls=InvoiceEncoder) # convert dict to JSON
prepared_data = [] # text of every span, collected from `text` below
# Then I want to clean up my JSON a little
# to get rid of PDF layout data (table positioning, widths etc.),
# e.g. {"number": 0, "bbox": [38.40968,...], ...},
# so that my JSON would be {"BILL FROM:": "TestCorp", "INVOICE #:": "01",...}.
# So we loop over the dictionary section called "blocks"
# (that way the total width/height data, which isn't needed, is omitted).
# Blocks that contain an "image" key are skipped; for the others the 'text'
# of each span is appended in block / line / span order.
for i in range(len(text['blocks'])):
if "image" in text['blocks'][i]:
continue
for j in range(len(text['blocks'][i]['lines'])):
for k in range(len(text['blocks'][i]['lines'][j]['spans'])):
prepared_data.append(text['blocks'][i]['lines'][j]['spans'][k]['text'])
# Second pass, in place: entries containing 'TAX' are left untouched; in the
# others every ',' becomes '.', and an entry with a '.' before its last three
# characters loses those last three characters.
# NOTE(review): it is unclear from this copy whether the '.' check is nested
# under the ',' check. Either way the entry holds no commas by then, so the
# trailing .replace(',', '') appears to have no effect — confirm intent.
for i in range(len(prepared_data)):
if 'TAX' in prepared_data[i]:
continue
if ',' in prepared_data[i]:
prepared_data[i] = prepared_data[i].replace(',', '.')
if '.' in prepared_data[i][:-3] :
prepared_data[i] = prepared_data[i][:-3].replace(',', '')
return prepared_data
# Module-level parse result: the PDF is read as soon as this line executes, and
# the functions defined after it read `prepared_data` as a global.
prepared_data = parse_invoide_pdf()
def get_invoice_items() -> list:
    """Return the spans lying between the item-table header and the footer.

    Reads the module-level ``prepared_data`` list. The slice starts right
    after the first entry matching ``(Amount*|Line*)`` and stops before the
    first entry matching ``(Notes*|Powered*|^Notes*)``; when no footer entry
    matches, it runs to the end of the list. A leading entry that ``isint``
    accepts is dropped. An empty slice is returned as an empty list.

    Raises:
        IndexError: if no entry matches the header pattern.
    """
    r_top = re.compile("(Amount*|Line*)")
    r_bot = re.compile("(Notes*|Powered*|^Notes*)")

    header_index = next(
        (i for i, text in enumerate(prepared_data) if r_top.match(text)), None
    )
    if header_index is None:
        raise IndexError("no item-table header found in prepared_data")
    # The items begin on the entry that follows the header.
    top_border = header_index + 1

    # None as a slice end means "through the end of the list".
    bottom_border = next(
        (i for i, text in enumerate(prepared_data) if r_bot.match(text)), None
    )

    items = prepared_data[top_border:bottom_border]
    if items and isint(items[0]):
        del items[0]
    return items
def generate_items_json_keys() -> list:
    """Collect the lower-cased column titles of the invoice item table.

    Scans the module-level ``prepared_data`` once per column kind and
    returns four lists, in the order description, quantity, price, total.
    Each list holds every entry whose beginning matches that column's
    pattern (case-insensitively) and is empty when nothing matched.
    """
    # Raw strings keep `\w` and `\s` as regex escapes rather than (invalid)
    # string escapes. `\w+\s+` lets a price title carry a leading word,
    # e.g. "Unit cost".
    column_patterns = (
        re.compile(r"(Item*|Description*)", re.IGNORECASE),
        re.compile(r"((\w+)quantity*|qty*|quantity*)", re.IGNORECASE),
        re.compile(r"((\w+\s+)cost*|price*|product*|rate*)", re.IGNORECASE),
        re.compile(r"(?=line|amount$).*", re.IGNORECASE),
    )
    return [
        [title.lower() for title in prepared_data if pattern.match(title)]
        for pattern in column_patterns
    ]
def generate_items_json_data() -> list:
    """Build one dict per invoice line item.

    The flat list from ``get_invoice_items()`` is cut into rows of five
    values, and each row is zipped with the flattened column titles from
    ``generate_items_json_keys()``. ``zip`` stops at the shorter side, so
    surplus titles or values are dropped. Every ``$`` is removed from the
    values.
    """
    # The titles are identical for every row, so compute them once.
    keys = [title for group in generate_items_json_keys() for title in group]
    return [
        {key: value.replace('$', '') for key, value in zip(keys, row)}
        for row in chunks(get_invoice_items(), 5)
    ]
def generate_info_json_data() -> dict:
    """Turn the invoice header spans into a ``{label: value}`` dict.

    Takes every entry of the module-level ``prepared_data`` that precedes
    the first one matching ``(item$|#)``, cleans each entry, drops those
    that end up blank, and pairs the remainder up as key, value, key,
    value, and so on. With an odd number of entries the last key maps to an
    empty string.

    Raises:
        IndexError: if no entry matches the boundary pattern.
    """
    r = re.compile(r"(item$|#)", flags=re.IGNORECASE)
    boundary = next(
        (i for i, text in enumerate(prepared_data) if r.match(text)), None
    )
    if boundary is None:
        raise IndexError("no item/# boundary entry found in prepared_data")
    info = prepared_data[:boundary]

    # if the 1st field is page number, remove it
    if info and isint(info[0]):
        del info[0]

    # Collect into a new list instead of deleting from `info` while looping
    # over it, so every field is cleaned and every blank one is dropped.
    cleaned = []
    for field in info:
        # remove $ sign from all fields so the resulting JSON can be parsed in apex
        field = field.replace('$', '')
        # spaces become underscores, except in entries isdate() recognises
        if not isdate(field) and ' ' in field:
            field = field.replace(' ', '_')
        field = field.replace('_:', '')
        # remove empty fields
        if field.strip():
            cleaned.append(field)

    # an odd number of fields cannot be paired up, so pad with an empty value
    if len(cleaned) % 2 != 0:
        cleaned.append('')
    return dict(zip(cleaned[::2], cleaned[1::2]))
def compose_invoice_json() -> dict:
    """Assemble the final invoice dict with lower-cased top-level keys.

    The header fields come from ``generate_info_json_data()`` and the line
    items are stored under ``'items'``. When an ``'invoice_date'`` string is
    present and dateutil can parse it, the value is rewritten as
    ``YYYY-MM-DD``; otherwise it is left untouched.
    """
    invoice_json = generate_info_json_data()
    invoice_json['items'] = generate_items_json_data()
    invoice_json_formatted = {k.lower(): v for k, v in invoice_json.items()}

    raw_date = invoice_json_formatted.get('invoice_date')
    if isinstance(raw_date, str):
        try:
            # dateutil infers the layout, so both '03 Apr 2022' and
            # '2022-09-23' are accepted without naming a format.
            # NOTE: ambiguous all-numeric dates such as '03/04/2022' are read
            # month-first unless dayfirst=True is passed.
            parsed = parse(raw_date, fuzzy=False)
        except (ValueError, OverflowError):
            # Not a recognisable date: keep the original text as it is.
            pass
        else:
            invoice_json_formatted['invoice_date'] = parsed.strftime("%Y-%m-%d")
    return invoice_json_formatted
# Script entry statement: builds the invoice dict and prints it.
print(f"RESULT ============ {compose_invoice_json()}")
Can I avoid specifying the format somehow? Or is there a better way to convert between date formats?
CodePudding user response:
The dateutil library has a very powerful parser.
from datetime import datetime
from dateutil import parser
def convert(date: str) -> str:
    """Parse ``date`` with dateutil and return it formatted as ``YYYY-MM-DD``."""
    return parser.parse(date).strftime("%Y-%m-%d")
# Example calls; the trailing comment on each line shows the string it returns.
convert("03 Apr 2022") # "2022-04-03"
convert("2022-09-23") # "2022-09-23"
CodePudding user response:
You could use exception handling: try one format first and fall back to the other, like this:
try:
result = datetime.strptime(invoice_json_formatted['invoice_date'], "%d %b %Y")
except ValueError:
result = datetime.strptime(invoice_json_formatted['invoice_date'], "%Y-%m-%d")
return result.strftime("%Y-%d-%m")