Below is a snippet of code I had been working on a few months back, but only now is it needed. I believe the main part of it is code I amended from an SO post, but I lost the URL. Either way, I had forgotten how slow it is when hundreds of thousands of files are involved, so I am looking into methods of making it faster.
I've tried moving parts of the code around and omitting certain sections, but performance either stays the same or gets worse, which leads me to believe the issue is in the os.listdir command. From what I have read, os.listdir is the fastest option here as it doesn't perform as many system calls as scandir or walk, but its performance is still sad with folders exceeding 100000 files, as shown by the timings below.
14387 files in 2794 folders processed in 5.88s
14387 files in 2794 folders processed in 3.224s
14387 files in 2794 folders processed in 5.847s
110016 files in 21440 folders processed in 22.732s
110016 files in 21440 folders processed in 22.603s
110016 files in 21440 folders processed in 41.055s
249714 files in 35707 folders processed in 66.452s
249714 files in 35707 folders processed in 49.154s
249714 files in 35707 folders processed in 88.43s
249714 files in 35707 folders processed in 48.942s
I am currently looking into another way of indexing the file/folder locations, using a static text file that would be prepopulated on the server every hour with the latest folder contents. Before I give up on the code below, though, I thought I would ask whether it can be made faster or whether it is already operating at its limit.
import tkinter as tk
import tkinter.ttk as ttk
from ttkwidgets import CheckboxTreeview
import os
import time
# Start the wall-clock timer before any GUI or filesystem work, so the total
# reported at the end also includes building the widgets below.
time_start = time.time()
# Module-level state shared with (and mutated by) the directory-scanning code.
iid = 1 # IID of tree item. 0 is top level parent
count_folders = 0 # Number of folders in parent
count_files = 0 # Number of files in parent
compare_check = {} # Build the dictionary with IID key and folder/file paths in list
# Main window with a fixed initial size; widgets are positioned with absolute
# coordinates via place().
root = tk.Tk()
root.geometry('850x450')
style = ttk.Style(root)
v_scrollbar = tk.Scrollbar(root, orient='vertical')
v_scrollbar.place(x=830, y=20, width=20, height=415)
# The tree reports its scroll position to the scrollbar (yscrollcommand) ...
tree = CheckboxTreeview(root, show='tree', yscrollcommand=v_scrollbar.set)
tree.place(x=10, y=20, anchor="nw", width=815, height=415)
# ... and the scrollbar drives the tree's vertical view in return.
v_scrollbar.config(command=tree.yview)
# Set the 'indent' option of the Treeview style to 15.
style.configure('Treeview', indent=15)
def new_folder(parent_path, directory_entries, parent_iid):
    """Recursively add the contents of one directory to the tree widget.

    Every name in ``directory_entries`` is inserted under ``parent_iid`` with
    a ``[F]`` (folder) or ``[f]`` (file) prefix and recorded in the
    module-level ``compare_check`` dictionary. Folders are then listed with
    ``os.listdir`` and processed by a recursive call.

    The dictionary key is the item id returned by ``tree.insert`` rather than
    one derived from a separate counter. A hand-maintained counter that is
    only advanced after the recursive call drifts away from the ids the tree
    actually assigned to folders; the returned id cannot drift.

    Args:
        parent_path: Path of the directory whose entries are being added.
        directory_entries: Entry names (not full paths) inside ``parent_path``.
        parent_iid: Tree item id under which the entries are inserted.

    Side effects:
        Inserts rows into the module-level ``tree`` and updates the
        module-level ``iid``, ``count_folders``, ``count_files`` and
        ``compare_check``.
    """
    global iid, count_folders, count_files
    for name in directory_entries:
        item_path = parent_path + os.sep + name
        is_folder = os.path.isdir(item_path)
        label = f'[F] {name}' if is_folder else f'[f] {name}'
        item_iid = tree.insert(parent=parent_iid, index='end', text=label)

        # Record the item before descending so the key always belongs to the
        # row that was just inserted. ``iid`` is still advanced once per item
        # as a running count for any code that reads it.
        iid += 1
        # NOTE(review): the [14:] slice drops a fixed-length prefix from the
        # parent path; presumably tuned to one specific root folder - confirm.
        compare_check[item_iid] = [parent_path, parent_path[14:], name]

        if not is_folder:
            count_files += 1  # for testing
            continue

        try:
            subdir_entries = os.listdir(item_path)
        except PermissionError:
            # Unreadable folders stay in the tree as empty nodes and are not
            # counted, matching the best-effort behaviour of the scan.
            continue
        new_folder(parent_path=item_path, directory_entries=subdir_entries, parent_iid=item_iid)
        count_folders += 1  # for testing
# Create the single top-level node that every scanned entry is inserted under.
parent_iid = tree.insert(parent='', index='0', text='All Documents', open=True)
start_path = os.path.expanduser(r"K:/DMC Processed - 02072017") # Path for test
# Only the starting directory is listed here; new_folder lists each
# sub-directory itself as it recurses.
start_dir_entries = os.listdir(start_path)
new_folder(parent_path=start_path, directory_entries=start_dir_entries, parent_iid=parent_iid)
# Stop the timer once the scan has returned, before the event loop starts.
time_end = time.time()
time_total = round(time_end - time_start, 3) # for testing. Simple start to end timer result
# Show the counters and elapsed time both in the window and on stdout.
ttk.Label(root, text=f"Files: {count_files} || Folders: {count_folders} || Time: {time_total}s", font='arial 10 bold').place(x=300, y=0) # for testing
print(f"{count_files} files in {count_folders} folders processed in {time_total}s") # for testing
# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()
CodePudding user response:
Since you nicely set it up with timing, I thought it'd be a fun challenge to give this a try.
I tried rewriting it to use os.walk, but I had a thought that your os.path.isdir() call would be incredibly slow, so I switched that out with os.scandir. Turns out that's the fastest way I could find.
Benchmarks:
original: 697665 files in 76729 folders processed in 106.079s
os.scandir: 697665 files in 76729 folders processed in 23.152s
os.walk: 697665 files in 76731 folders processed in 32.869s
Using the scandir module didn't seem to make much difference; it seems Python has optimised os quite nicely now.
Here's your code with the other functions:
import tkinter as tk
import tkinter.ttk as ttk
from ttkwidgets import CheckboxTreeview
import os
import scandir
import time
# Start the wall-clock timer before any GUI or filesystem work, so the total
# reported at the end also includes building the widgets below.
time_start = time.time()
# Module-level state shared with (and mutated by) the directory-scanning
# functions.
iid = 1 # IID of tree item. 0 is top level parent
count_folders = 0 # Number of folders in parent
count_files = 0 # Number of files in parent
compare_check = {} # Build the dictionary with IID key and folder/file paths in list
# Main window with a fixed initial size; widgets are positioned with absolute
# coordinates via place().
root = tk.Tk()
root.geometry('850x450')
style = ttk.Style(root)
v_scrollbar = tk.Scrollbar(root, orient='vertical')
v_scrollbar.place(x=830, y=20, width=20, height=415)
# The tree reports its scroll position to the scrollbar (yscrollcommand) ...
tree = CheckboxTreeview(root, show='tree', yscrollcommand=v_scrollbar.set)
tree.place(x=10, y=20, anchor="nw", width=815, height=415)
# ... and the scrollbar drives the tree's vertical view in return.
v_scrollbar.config(command=tree.yview)
# Set the 'indent' option of the Treeview style to 15.
style.configure('Treeview', indent=15)
def new_folder(parent_path, directory_entries, parent_iid):
    """Recursively add the contents of one directory to the tree widget.

    This is the original ``os.listdir`` / ``os.path.isdir`` implementation,
    kept as the baseline for benchmarking. Every name in
    ``directory_entries`` is inserted under ``parent_iid`` with a ``[F]``
    (folder) or ``[f]`` (file) prefix and recorded in the module-level
    ``compare_check`` dictionary. Folders are then listed with ``os.listdir``
    and processed by a recursive call.

    The dictionary key is the item id returned by ``tree.insert`` rather than
    one derived from a separate counter. A hand-maintained counter that is
    only advanced after the recursive call drifts away from the ids the tree
    actually assigned to folders; the returned id cannot drift.

    Args:
        parent_path: Path of the directory whose entries are being added.
        directory_entries: Entry names (not full paths) inside ``parent_path``.
        parent_iid: Tree item id under which the entries are inserted.

    Side effects:
        Inserts rows into the module-level ``tree`` and updates the
        module-level ``iid``, ``count_folders``, ``count_files`` and
        ``compare_check``.
    """
    global iid, count_folders, count_files
    for name in directory_entries:
        item_path = parent_path + os.sep + name
        # os.path.isdir costs one extra stat call per entry; this is the part
        # that makes the baseline slow on large trees.
        is_folder = os.path.isdir(item_path)
        label = f'[F] {name}' if is_folder else f'[f] {name}'
        item_iid = tree.insert(parent=parent_iid, index='end', text=label)

        # Record the item before descending so the key always belongs to the
        # row that was just inserted. ``iid`` is still advanced once per item
        # as a running count for any code that reads it.
        iid += 1
        # NOTE(review): the [14:] slice drops a fixed-length prefix from the
        # parent path; presumably tuned to one specific root folder - confirm.
        compare_check[item_iid] = [parent_path, parent_path[14:], name]

        if not is_folder:
            count_files += 1  # for testing
            continue

        try:
            subdir_entries = os.listdir(item_path)
        except PermissionError:
            # Unreadable folders stay in the tree as empty nodes and are not
            # counted, matching the best-effort behaviour of the scan.
            continue
        new_folder(parent_path=item_path, directory_entries=subdir_entries, parent_iid=item_iid)
        count_folders += 1  # for testing
def new_folder_scandir(parent_path, parent_iid):
    """Recursively add the contents of ``parent_path`` using ``os.scandir``.

    ``os.scandir`` yields ``DirEntry`` objects whose ``is_dir()`` result is
    usually available without an extra system call, which avoids the
    per-entry ``os.path.isdir`` cost of the ``os.listdir`` approach.

    Each entry is inserted under ``parent_iid`` with a ``[F]`` (folder) or
    ``[f]`` (file) prefix and recorded in the module-level ``compare_check``
    dictionary, keyed by the item id returned from ``tree.insert``. The
    entry's plain ``name`` is used for both the label and the record;
    formatting the ``DirEntry`` object itself would render as
    ``<DirEntry 'name'>``.

    Args:
        parent_path: Path of the directory to scan.
        parent_iid: Tree item id under which the entries are inserted.

    Raises:
        PermissionError: If ``parent_path`` itself cannot be scanned. For
            sub-directories the error is caught and the folder is left empty.

    Side effects:
        Inserts rows into the module-level ``tree`` and updates the
        module-level ``iid``, ``count_folders``, ``count_files`` and
        ``compare_check``.
    """
    global iid, count_folders, count_files
    # The context manager closes the directory handle even if the loop exits
    # early because of an exception.
    with os.scandir(parent_path) as entries:
        for entry in entries:
            is_folder = entry.is_dir()
            label = f'[F] {entry.name}' if is_folder else f'[f] {entry.name}'
            item_iid = tree.insert(parent=parent_iid, index='end', text=label)

            # Record the item before descending so the key always belongs to
            # the row that was just inserted. ``iid`` is still advanced once
            # per item as a running count for any code that reads it.
            iid += 1
            # NOTE(review): the [14:] slice drops a fixed-length prefix from
            # the parent path; presumably tuned to one specific root - confirm.
            compare_check[item_iid] = [parent_path, parent_path[14:], entry.name]

            if not is_folder:
                count_files += 1  # for testing
                continue

            try:
                new_folder_scandir(parent_path=entry.path, parent_iid=item_iid)
            except PermissionError:
                # Unreadable folders stay in the tree as empty nodes and are
                # not counted.
                continue
            count_folders += 1  # for testing
def new_folder_walk(path):
    """Populate the tree from ``path`` using a top-down ``os.walk``.

    Creates the top-level ``All Documents`` node itself, then inserts every
    directory and file reported by the walk under the node of its containing
    directory. A dictionary maps each directory path to its tree item id so
    that children can be attached when the walk reaches that directory.

    The standard-library ``os.walk`` is used; it is built on ``os.scandir``,
    so the separate ``scandir`` backport package is not needed here.

    Each inserted item is recorded in the module-level ``compare_check``
    dictionary, keyed by the item id returned from ``tree.insert``.

    Args:
        path: Root directory to walk.

    Side effects:
        Inserts rows into the module-level ``tree`` and updates the
        module-level ``iid``, ``count_folders``, ``count_files`` and
        ``compare_check``.
    """
    global iid, count_folders, count_files

    # Directory path -> tree item id. Seeded with the root so the first
    # directory yielded by the walk has a parent node to attach to.
    tree_items = {path: tree.insert(parent='', index='0', text='All Documents', open=True)}

    for current_dir, dir_names, file_names in os.walk(path):
        current_iid = tree_items[current_dir]

        for dir_name in dir_names:
            item_iid = tree.insert(parent=current_iid, index='end', text=f'[F] {dir_name}')
            # The walk later yields this sub-directory as exactly this joined
            # path, so it can be looked up when its own contents are visited.
            tree_items[os.path.join(current_dir, dir_name)] = item_iid
            count_folders += 1
            iid += 1
            # NOTE(review): the [14:] slice drops a fixed-length prefix from
            # the parent path; presumably tuned to one specific root - confirm.
            compare_check[item_iid] = [current_dir, current_dir[14:], dir_name]

        for file_name in file_names:
            item_iid = tree.insert(parent=current_iid, index='end', text=f'[f] {file_name}')
            count_files += 1
            iid += 1
            compare_check[item_iid] = [current_dir, current_dir[14:], file_name]
start_path = os.path.expanduser(r"C:/Program Files") # Path for test
# Select which implementation to benchmark:
# 0 = original, 1 = scandir, 2 = walk
run = 1
if run == 0:
    # Baseline: os.listdir + os.path.isdir. The starting directory is listed
    # here; new_folder lists each sub-directory itself as it recurses.
    parent_iid = tree.insert(parent='', index='0', text='All Documents', open=True)
    start_dir_entries = os.listdir(start_path)
    new_folder(parent_path=start_path, directory_entries=start_dir_entries, parent_iid=parent_iid)
elif run == 1:
    parent_iid = tree.insert(parent='', index='0', text='All Documents', open=True)
    new_folder_scandir(parent_path=start_path, parent_iid=parent_iid)
elif run == 2:
    # new_folder_walk creates the top-level 'All Documents' node itself.
    new_folder_walk(start_path)
# Stop the timer once the scan has returned, before the event loop starts.
time_end = time.time()
time_total = round(time_end - time_start, 3) # for testing. Simple start to end timer result
# Show the counters and elapsed time both in the window and on stdout.
ttk.Label(root, text=f"Files: {count_files} || Folders: {count_folders} || Time: {time_total}s", font='arial 10 bold').place(x=300, y=0) # for testing
print(f"{count_files} files in {count_folders} folders processed in {time_total}s") # for testing
# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()
For the record, I'm actually surprised that os.walk is slower than os.scandir even when iterating through every file.