I need to extract all email addresses from all text files within a directory that has a lot of subdirectories. It is too much work to do this manually, so I wrote the Python script below to automate the task. However, when I execute the script I end up with an empty list being printed, and no errors are shown. Can someone please point out what I'm doing wrong?
# Import Module
import os
import re

# Folder Path
path = "pat to the root directory"

# Change the directory
os.chdir(path)

# create list and index to add the emails
new_list = []
idx = 0


# I create a method to add all email addresses from within the subdirectories
# to an array
def read_text_file(file_path):
    with open(file_path, 'r') as f:
        emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", str(f))
        new_list.insert(idx, emails)
        # NOTE(review): the operator on this line was lost in extraction;
        # "idx + 1" is the minimal reading of what was posted - confirm.
        idx + 1


# iterate through all file and call the method from above
for file in os.listdir():
    # Check whether file is in text format or not
    if file.endswith(".txt"):
        p = f"{path}\{file}"
        # call read text file function
        read_text_file(p)

# print the array
print (new_list)
CodePudding user response:
To check subdirectories, as you said you want to, you need to test whether each item returned by os.listdir() is a folder or a file. If it is a folder, check all the files in that folder (and if there are more folders inside it, check them as well). If it is a file, process it only when its name ends with .txt.
You also need to read() the file (f.read()); only then can you pass its contents to re.findall().
# Import Module
import os
import re

# Folder Path
PATH = r"path to folder"  # constants are UPPER CASED LETTERS

# Simple address matcher, compiled once because it is applied to every file.
# IGNORECASE lets it match addresses that contain capital letters.
EMAIL_PATTERN = re.compile(
    r"[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+", re.IGNORECASE
)

# create list to add the emails
new_list = []


def read_text_file(file_path):
    """Append every email address found in the file to ``new_list``.

    Args:
        file_path: Path of the text file to scan.
    """
    # errors="replace" keeps one badly encoded file from aborting the scan.
    with open(file_path, 'r', encoding="utf-8", errors="replace") as f:
        emails = EMAIL_PATTERN.findall(f.read())
    # Extend rather than assign, so matches from earlier files are kept.
    new_list.extend(emails)


def find_all_text_files(path):
    """Scan every ``.txt`` file in ``path`` and in all folders below it.

    Args:
        path: Directory to start from.
    """
    for file_or_dir in os.listdir(path):
        # os.path.join inserts the correct separator for the current OS.
        full_path = os.path.join(path, file_or_dir)
        if os.path.isdir(full_path):
            # if the current item is dir, descend into it
            find_all_text_files(full_path)
        elif file_or_dir.endswith(".txt") and os.path.isfile(full_path):
            # call read text file function
            read_text_file(full_path)


def main():
    """Collect the addresses below ``PATH`` and print them."""
    find_all_text_files(PATH)
    # print the array
    print(new_list)


if __name__ == '__main__':
    main()
CodePudding user response:
When your code doesn't produce the expected results, you need to debug it to find the cause. Is it your regex? Is it your list.insert()? Is it the conversion of the file handle f directly to a string? Something else entirely? Let's find out by modifying your read_text_file() function to print its status:
def read_text_file(file_path):
    """Scan one file for email addresses, printing progress along the way.

    Matches are appended to the module-level ``new_list``.
    """
    print(f"attempting to parse {file_path}")
    with open(file_path, 'r') as f:
        # Read the contents; the file object itself is not the text.
        huge_line = f.read()
        emails = re.findall(r"[a-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+", huge_line)
        print(f"found {len(emails)} emails in file {file_path}")
        new_list.extend(emails)
I dropped the idx variable and just used list.extend(). Try that out and see where it fails, then add more print statements as needed to narrow it down.