I use the following code to generate a list of common company suffixes:
import re
import string

from cleanco import typesources
def generate_common_suffixes():
    """Return the known company-type suffixes, lower-cased and de-duplicated.

    Every entry of every group returned by ``typesources()`` is lower-cased.
    The extra word ``'holding'`` is added at the end unless it is already
    among the collected suffixes.

    Returns:
        list[str]: unique lower-case suffixes in first-seen order.
    """
    lowered = [i.lower() for item in typesources() for i in item]
    # dict.fromkeys drops duplicates in one pass while keeping first-seen
    # order, replacing the quadratic "x not in list" check.
    return list(dict.fromkeys(lowered + ['holding']))
I'm then trying to use the following code to remove those suffixes from a list of company names:
# Company names to be cleaned of legal-form suffixes.
company_name = ['SAMSUNG ÊLECTRONICS Holding, LTD', 'Apple inc',
                'FIIG Securities Limited Asset Management Arm',
                'First Eagle Alternative Credit, LLC',
                'Global Credit Investments', 'Seatown',
                'Sona Asset Management']
suffixes = generate_common_suffixes()
# Build ONE pattern out of all suffixes:
# - longest first, so a longer suffix is tried before a shorter one that it
#   starts with;
# - IGNORECASE, because the suffixes are lower-cased while the names are
#   not ('LTD', 'Holding', ...);
# - (?<!\w) / (?!\w) instead of \b, since \b cannot match beside a suffix
#   that begins or ends with a non-word character (e.g. a trailing '.').
suffix_pattern = re.compile(
    r'(?<!\w)(?:{})(?!\w)'.format(
        '|'.join(re.escape(suffix)
                 for suffix in sorted(suffixes, key=len, reverse=True))),
    re.IGNORECASE,
)
cleaned_names = []
for company in company_name:
    # A single substitution removes every suffix at once. Substituting one
    # suffix at a time from the untouched `company` would throw away all
    # removals except the last one.
    new = suffix_pattern.sub('', company)
    cleaned_names.append(new)
I keep getting a list of unchanged company names despite knowing that the suffixes are there.
Alternate Attempt
I've also tried an alternate method where I'd look for the word and replace it without regex,
but I couldn't figure out why it was removing parts of the company name itself. For example, it would remove the first 3 letters in Samsung:
# str.replace() matches plain substrings, not whole words, so a word that
# also occurs inside a longer word is removed from there as well.
for word in common_words:
name = name.replace(word, "")
Any help is greatly appreciated!
CodePudding user response:
import unicodedata
from cleanco import basename
import re
company_names = ['SAMSUNG ÊLECTRONICS Holding, LTD',
                 'Apple inc',
                 'FIIG Securities Limited Asset Management Arm',
                 'First Eagle Alternative Credit, LLC',
                 'Global Credit Investments',
                 'Seatown',
                 'Sona Asset Management']
suffix = ["holding"]  # "Common words"? You can add more
# Compile the whole-word patterns once, outside the loop. re.escape keeps
# any regex metacharacter in a word from being read as pattern syntax.
common_word_patterns = [re.compile(fr"\b{re.escape(word)}\b")
                        for word in suffix]
cleaned_names = []
for company_name in company_names:
    # To Lower
    company_name = company_name.lower()
    # Fix unicode: decompose accented characters, then drop every
    # non-ASCII code point.
    company_name = (unicodedata.normalize('NFKD', company_name)
                    .encode('ASCII', 'ignore')
                    .decode())
    # Remove punctuation (anything that is neither a word character nor
    # whitespace)
    company_name = re.sub(r'[^\w\s]', '', company_name)
    # Remove suffixes
    company_name = basename(company_name)
    # Remove common words (whole-word matches only)
    for pattern in common_word_patterns:
        company_name = pattern.sub('', company_name)
    # Save
    cleaned_names.append(company_name)
print(cleaned_names)
Output:
['samsung aalectronics ', 'apple', 'fiig securities limited asset management arm', 'first eagle alternative credit', 'global credit investments', 'seatown', 'sona asset management']