Home > front end >  Remove all duplicate entries from lists within dictionary except for the first entry (Python)
Remove all duplicate entries from lists within dictionary except for the first entry (Python)

Time:01-08

Given:

dict_books = {
     'key_biography': ['All quiet on the western front', 'My life'],
     'key_fiction': ['All quiet on the western front', 'Lord of the Rings'],
     'key_fairytales':['Lord of the Rings']
     }

Please remove all duplicate values from entire dictionary except the original entry and return something like this:

dict_books = {
    'key_biography': ['All quiet on the western front', 'My life'],
    'key_fiction': ['Lord of the Rings'],
    'key_fairytales':[]
    }

EDIT

I narrowed the problem down as much as I felt capable. You can find the github link (including needed .txt files) here: https://github.com/kormuch/python_file_organizer/tree/main/file organizer v1.15

to jump to the problem, go to the three ### I wrote the output and what's wrong with it down at the very bottom.

###stackoverflow: duplicate entries from lists within dictionary except for the first entry need to be removed

import os
import sys
import shutil
from datetime import date
from datetime import datetime


#directories with relative paths
dir_source_files = r'source files'
dir_destination = r'destination'
dir_keyword_files = r'keywords'
dir_logfiles = r'logfiles'


def create_folders_and_move_files():
    #now a new directory is created and has the same name as the .txt file
    scanned_dir_keyword_files = os.scandir(dir_keyword_files)
    for file in scanned_dir_keyword_files: #creates new dirs named after .txt files in destination folder
        print((f".txt file found in dir_keyword_files:\n{file} name of new folder: "   os.path.splitext(file.name)[0]))
        new_dir_name = str(os.path.splitext(file.name)[0])
        path_for_new_dir = os.path.join(dir_destination, new_dir_name)

        try:
            os.makedirs(path_for_new_dir, exist_ok = False)
            print(f"directory creation succesful. Created directory: {new_dir_name}\n")
        except OSError as error:
            print(f"directory creation failed. '{new_dir_name}' already exists\n")    
    
    #creating a clean keywordlist, finding keywords in sourcefiles, moving sourcefiles when matched
    for txt_file_with_keywords in os.scandir(dir_keyword_files):
        print(f"    TASK 1: iterating through parent txt-keywordfile:\n {txt_file_with_keywords} ")
        keywordlist_a = []
        with open(txt_file_with_keywords) as txt_full_with_hashtags:
            for keyword_with_hashtags in txt_full_with_hashtags.readlines():
                keywords_without_hashtags = keyword_with_hashtags.rstrip().split('#') #automatically creates list and removes the hashtag from every keyword
                keywords_without_hashtags.remove('') #removes empty entries from list
            print(f"    TASK 2: fill child keywordlist_a with keywords from parent:") 
            for keyword_without_hashtag in keywords_without_hashtags:
                keywordlist_a.append(keyword_without_hashtag)
                print('keyword added to keywordlist_a: '   keyword_without_hashtag)
            print("keywordlist_a ready:")
            print(keywordlist_a)
                
                #### this is where the fun begins:
            for scanned_dir_destination in os.scandir(dir_destination):
                print("This is the destination path "   str(os.path.realpath(scanned_dir_destination)))                    
                for keyword_a in keywordlist_a:
                        for scanned_scource_file in os.scandir(dir_source_files):
                            print(f"    TASK 3: comparing keyword '{keyword_a}' to '{scanned_scource_file}'")
                            if keyword_a.lower() in str(scanned_scource_file).lower():
                                print(f"Bingo! '{keyword_a}' found in '{scanned_scource_file}'")
                                try:
                                    shutil.move(scanned_scource_file, scanned_dir_destination)
                                    print(f"    TASK 4: moving {scanned_scource_file} to {scanned_dir_destination}")
                                except:
                                 pass
            print("\n")            
 
                
def extract_files_from_subfolder_in_source_files():
    for possible_dir in os.scandir(dir_source_files):
        if possible_dir.is_dir():
            subfolder_path_in_source_files = os.listdir(dir_source_files   "/"   possible_dir.name)
            print(f"Subfolder found in source folder: {possible_dir}.")
            for subfolder_file in subfolder_path_in_source_files:
                shutil.move(os.path.join(possible_dir.path, subfolder_file), dir_source_files)
                print(f"FILE: {subfolder_file} extracted from {possible_dir} and moved to {dir_source_files}.")
    print("All files scanned for subfolders")


def delete_empty_folder_in_source_files():
    for a_possibly_empty_folder in os.scandir(dir_source_files):
        print(a_possibly_empty_folder)
        if a_possibly_empty_folder.is_dir():
            print(f"{a_possibly_empty_folder} is dir")
            subfolder_path = os.listdir(dir_source_files   "/"   a_possibly_empty_folder.name)
            if len(subfolder_path) == 0:
                print(f"EMPTY FOLDER FOUND \n The folder {a_possibly_empty_folder} is empty and will be deleted.\n")
                shutil.rmtree(a_possibly_empty_folder)
                

#extract_files_from_subfolder_in_source_files()
#create_folders_and_move_files()
#delete_empty_folder_in_source_files()

def create_logfile():
    current_date = datetime.now()
    dt_string = current_date.strftime("%Y%m%d%H%M%S")
    print(f"creating logfile: {dt_string}.txt")
    try:
        logfile = open(f"{dir_logfiles}\logfile {dt_string}.txt", "x ")
    except OSError as error:
        print(f"logfile creation failed. logfile already exists\n")
        pass
    
    
def give_dict_destinationFolders_and_listOf_srcFiles():
    dict_destinationFolder_and_listOf_srcFiles = {}
    for txt_file_with_keywords in os.scandir(dir_keyword_files): #creates new dirs named after .txt files in destination folder
        list_value_listOf_srcFiles=[]
        new_dir_name = str(os.path.splitext(txt_file_with_keywords.name)[0])
        keywordlist_a = []
        with open(txt_file_with_keywords) as txt_full_with_hashtags:
            for keyword_with_hashtags in txt_full_with_hashtags.readlines():
                keywords_without_hashtags = keyword_with_hashtags.rstrip().split('#') #automatically creates list and removes the hashtag from every keyword
                keywords_without_hashtags.remove('') #removes empty entries from list
            for keyword_without_hashtag in keywords_without_hashtags:
                keywordlist_a.append(keyword_without_hashtag)
               
        #### this is where the fun begins:
        for keyword_a in keywordlist_a:
            for scanned_scource_file in os.scandir(dir_source_files):
                if keyword_a.lower() in str(scanned_scource_file).lower():
                    #appending source file titles to a list. the list will become values in the dict "dict_destinationName_and_listOf_srcFiles"
                    list_value_listOf_srcFiles.append(str((os.path.splitext(scanned_scource_file.name)[0])) str((os.path.splitext(scanned_scource_file.name)[1])))
                    list(dict.fromkeys(list_value_listOf_srcFiles))#removing double entries
        dict_destinationFolder_and_listOf_srcFiles[new_dir_name]=list_value_listOf_srcFiles
    return(dict_destinationFolder_and_listOf_srcFiles)




###stackoverflow: duplicate entries from lists within dictionary except for the first entry need to be removed
def giveNames_of_destinationFolders_and_srcFiles():
    for destination_folder, list_srcFiles in give_dict_destinationFolders_and_listOf_srcFiles().items():
        print(f"Following files will be moved to destination folder '{destination_folder}':")
        for src_file in list_srcFiles:
            print(f"    - {src_file}")
        print("\n")
            

giveNames_of_destinationFolders_and_srcFiles()  

OUTPUT:

    Following files will be moved to destination folder 'audiobooks':
        - How to Read People Like a Book -James W. Williams -Full Audiobook (192kbit_AAC).m4a.txt
        - How to Talk to Anyone 92 Little Tricks for Big Success in Relationships Audiobook (128kbit_AAC).m4a.txt
        - lord of the rings.txt

Following files will be moved to destination folder 'movies':
    - lord of the rings.txt

My program says that "lord of the rings.txt" will be moved twice. Which cannot happen. It'll be gone before that happen. But I need to display what happens because the printed message will be what the user sees before running the program.

CodePudding user response:

You need 2 things:

  1. something to hold the values you have seen. i will use a set, but you can use a list too.
seen = set()
  1. and a new dict for the output
output = {}

And then loop through your dict, add new entries to seen and skip the ones already there

for key, books in dict_books.items():
    unseen_books = []
    for book in books:
        if book in seen:
            continue
        seen.add(book)
        unseen_books.append(book)
    output[key] = unseen_books
  • Related