Find important keyword in a list of strings-CodePudding

I have a function that organizes files in a particular directory on the basis of its name, Basically what the function does is split the files in the directory based on their name then create folders with that name, then all the files containing that name will be moved into that folder. For example if there are two files wave.png and wave-edited.png it will create a folder named wave and because those two files contain the keyword wave they will be moved into that folder. I am stuck figuring out how to get the keyword

List of file_names = ['ghosts-edited.png', 'ghosts.png', 'wave.png', 'wave-edited.png', '10-14-day', '12-11-day']

Expected output

['ghosts', 'wave', 'day']

Code:

def name_category():
    sub_file_names = []
    file_names = []
    delimiters = ['.', ',', '!', ' ', '-', ';', '?', '*', '!', '@', '#', '$', '%', '^', '&', '(', ')', '_', '/', '|', '<', '>']
    try:
        for filename in os.listdir(folder_to_track):
            filename = filename.lower()
            file_names.append(filename)
            sub_file_names.append(max(re.findall(r'[A-Za-z] ',filename),key=len)) # I want to replace this method
            sub_file_names = list(set(sub_file_names))
        file_mappings = collections.defaultdict()
        for filename in os.listdir(folder_to_track):
            if not os.path.isdir(os.path.join(folder_to_track, filename)):
                for sub_file_name in sub_file_names:
                    file_mappings.setdefault(sub_file_name, []).append(filename)

        for folder_name, folder_items in file_mappings.items():
            folder_path = os.path.join(folder_to_track, folder_name)
            if not os.path.exists(folder_path):
                os.mkdir(folder_path)
                
                for filename in file_names:
                    filename = filename.lower()
                    i = 1
                    regexPattern = '|'.join(map(re.escape, delimiters))
                    splittedstring = re.split(regexPattern, filename, 0)
                    if folder_name in splittedstring:
                        new_name = filename
                        file_exits = os.path.isfile(folder_path   '\\'   new_name)
                        while file_exits:
                            i  = 1
                            new_name = os.path.splitext(folder_to_track   '\\'   new_name)[0]   str(i)   os.path.splitext(folder_to_track   '\\'   new_name)[1]   
                            new_name = new_name.split("\\")[4]
                            file_exits = os.path.isfile(folder_path   "\\"   new_name)
                        src = folder_to_track   "\\"   filename
                        new_name = folder_path   "\\"   new_name
                        os.rename(src, new_name)
        
    except Exception as e:
        print(e)

sub_file_names when printed:

['ghosts', 'wave', 'edited']

Right now I am filtering the keywords using the biggest word in the filename called sub_file_name.

CodePudding user response：

IIUC, you would like a method that could obtain the names before the first non-word character.

Code

def find_prefixes(strings):
    '''
       Finds common prefix in strings before non-alphanumeric character
    '''
    prefixes = set()                       # set of prefixes
    pattern = re.compile('[^a-zA-Z0-9]')   # pattern to detect non-alphanumeric character
                         
    for string in strings:
            # Add first non-integer list
            arr = pattern.split(string)  # single split on non-letter character
            for prefix in arr:
                if not prefix.isdigit(): # first that is not a number
                    prefixes.add(prefix) # add to prefix
                    break
    return list(prefixes)

Test

file_names = ['10-12-day', '12-11-day', 'ghosts-edited.png', 'ghosts.png', 'wave.png', 'wave-edited.png']    
print(find_prefixes(file_names))
# Output: ['wave', 'ghosts', 'day']

CodePudding user response：

You can create a dictionary where keys will be desired folder names and values will be lists with filenames. For example:

names = [
    "ghosts-edited.png",
    "ghosts.png",
    "wave.png",
    "wave-edited.png",
    "another.png",
    "just-edited.png",
]

out = {}
for n in names:
    if "-edited" in n:
        out.setdefault(n.rsplit("-", maxsplit=1)[0], []).append(n)
    else:
        out.setdefault(n.split(".")[0], []).append(n)

print(out)

Prints:

{
    "ghosts": ["ghosts-edited.png", "ghosts.png"],
    "wave": ["wave.png", "wave-edited.png"],
    "another": ["another.png"],
    "just": ["just-edited.png"],
}