Problems when applying function with multiple arguments to pandas dataframe


I have a function that checks for co-occurring words in a string of text. I would like to use this function on a pandas dataframe so that I can check for co-occurrences within the sentences of different documents. Unfortunately, the way I am applying the function to the dataframe does not seem to work properly.

The following code checks whether any word in list 'bag1' occurs within 'dist = 4' tokens of a word from list 'bag2' in the string called 'sentence'. If there is a co-occurrence, the code prints True. This code works fine.

import re
import itertools
from nltk.tokenize import word_tokenize
sentence = "The plant is growing at a rapid rate. But the beans are growing slowly."
sentence = re.sub('[^A-Za-z0-9] ', ' ', sentence).lstrip().lower()
words = word_tokenize(sentence)

bag1 = ["plant", "beans", "banana", "apple"]                  
bag2 = ["growing", "fast", "fruit"]
dist = 4

def get_distance(lst1, lst2, dist):
    lst1 = [i for i in lst1 if i.lower() in words]
    lst2 = [i for i in lst2 if i.lower() in words]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in words and w2 in words:
            w1_indexes = [index for index, value in enumerate(words) if value == w1]
            w2_indexes = [index for index, value in enumerate(words) if value == w2]
            distances = [abs(item[0] - item[1]) for item in itertools.product(w1_indexes, w2_indexes)]
            if min(distances) <= dist:
                print(True)
                break
            else:
                print(False)


def main():
    get_distance(bag1, bag2, dist)

main()

Here I am applying the 'get_distance' function to check if any word in list 'bag1' occurs near a word from list 'bag2' in the individual sentences of each document. If there is a co-occurrence, I would like to mark the column 'Match?' as True. Unfortunately, this code does not work properly. The problems seem to start at 'lst1 = [i for i in lst1 if i.lower() in row.tokens]', which somehow generates an empty list. Even if I remove that line, there still seems to be a bug in the remaining code (see the comments below).

import pandas as pd
import re
import itertools
import nltk
from nltk.tokenize import word_tokenize

dataset = {
   "document": ["doc1", 'doc2'], 
   "text": ['The plant is growing at a rapid rate. But the beans are growing slowly.', 'The beans are are growing fast in the region.'], 
}
df = pd.DataFrame(dataset)

bag1 = ["plant", "beans", "banana", "apple"]                  
bag2 = ["growing", "fast", "fruit"]
dist = 4

def clean_text():
    df['text'] = df.text.str.lower()
    df['text'] = df.text.str.replace('\ufeff', '')
    df['text'] = df.text.str.strip()

def split_sentences():
    global df
    df["sentences"] = df["text"].apply(nltk.sent_tokenize)
    df = df.explode('sentences')

def tokenize_words():
    df['words_in_text'] = df['sentences'].apply(word_tokenize)

def get_distance(row, lst1, lst2, dist):
    row.tokens = df["words_in_text"]
    # It seems that the two lines below generate empty lists. The rest of the code does not work even if I remove the following two lines
    lst1 = [i for i in lst1 if i.lower() in row.tokens]
    lst2 = [i for i in lst2 if i.lower() in row.tokens]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in row.tokens and w2 in row.tokens:
            w1_indexes = [index for index, value in enumerate(row.tokens) if value == w1]
            w2_indexes = [index for index, value in enumerate(row.tokens) if value == w2]
            distances = [abs(item[0] - item[1]) for item in itertools.product(w1_indexes, w2_indexes)]
            if min(distances) <= dist:
                return True
                break
            else:
                return False

# Despite get_distance having a return value, the values in the 'Match?' column are all None

def main():
    clean_text()
    split_sentences()
    tokenize_words()
    df['Match?'] = df.apply(get_distance, args=(bag1, bag2, dist), axis=1)
    display(df)

if __name__ == "__main__":
    main()

Clearly, I am doing something wrong when applying the 'get_distance' function to the pandas dataframe. Please let me know if you can spot the mistakes. Thank you.

CodePudding user response:

I believe that you wanted something closer to

def get_distance(row, lst1, lst2, dist):
    
    row_tokens = row.words_in_text
    # Keep only the bag words that actually appear in this row's tokens
    lst1 = [i for i in lst1 if i.lower() in row_tokens] 
    lst2 = [i for i in lst2 if i.lower() in row_tokens]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in row_tokens and w2 in row_tokens:
            w1_indexes = [index for index, value in enumerate(row_tokens) if value == w1]    
            w2_indexes = [index for index, value in enumerate(row_tokens) if value == w2]    
            distances = [abs(item[0] - item[1]) for item in itertools.product(w1_indexes, w2_indexes)]
            if min(distances) <= dist:
                return True
            else:
                return False
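
The reason the bag lists came out empty is that 'row.tokens = df["words_in_text"]' binds the entire 'words_in_text' column (a pandas Series) to the row, and the 'in' operator on a Series tests the index labels rather than the values, so none of the bag words are ever found. A minimal sketch of the difference, using a toy dataframe with the same column name:

import pandas as pd

toy = pd.DataFrame({
    "words_in_text": [
        ["the", "plant", "is", "growing"],
        ["the", "beans", "are", "growing", "fast"],
    ]
})

# Membership against the whole column tests the index labels (0, 1, ...), not the token lists
print("plant" in toy["words_in_text"])   # False, so the filters strip out every bag word

# Membership against a single row's value tests that row's token list
row = toy.iloc[0]
print("plant" in row.words_in_text)      # True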

I would also suggest:

  1. Avoiding global. If you have to use it, that's a clue that you should redesign your function.
  2. Instead, take the df as an argument to your functions
  3. Therefore, don't refer to the global df at all in your functions
  4. In fact, get rid of the global df altogether
  5. And use black to auto-format your code
  6. And use isort to auto-format your imports
  7. And use flake8 to find common errors

Putting those suggestions together:
import itertools

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

dataset = {
    "document": ["doc1", "doc2"],
    "text": [
        "The plant is growing at a rapid rate. But the beans are growing slowly.",
        "The beans are are growing fast in the region.",
    ],
}

bag1 = ["plant", "beans", "banana", "apple"]
bag2 = ["growing", "fast", "fruit"]
dist = 4


def clean_text(df):
    df["text"] = df.text.str.lower()
    df["text"] = df.text.str.replace("\ufeff", "")
    df["text"] = df.text.str.strip()


def split_sentences(df):
    df["sentences"] = df["text"].apply(nltk.sent_tokenize)
    return df.explode("sentences")


def tokenize_words(df):
    df["words_in_text"] = df["sentences"].apply(word_tokenize)


def get_distance(row, lst1, lst2, dist):

    # We want the words_in_text for this row, not the whole df
    row_tokens = row.words_in_text

    lst1 = [i for i in lst1 if i.lower() in row_tokens]
    lst2 = [i for i in lst2 if i.lower() in row_tokens]
    combinations = list(itertools.product(lst1, lst2))
    for w1, w2 in combinations:
        if w1 in row_tokens and w2 in row_tokens:
            w1_indexes = [
                index for index, value in enumerate(row_tokens) if value == w1
            ]
            w2_indexes = [
                index for index, value in enumerate(row_tokens) if value == w2
            ]
            distances = [
                abs(item[0] - item[1])
                for item in itertools.product(w1_indexes, w2_indexes)
            ]
            if min(distances) <= dist:
                return True
            else:
                return False


def main():
    my_df = pd.DataFrame(dataset)
    clean_text(my_df)
    my_df = split_sentences(my_df)
    tokenize_words(my_df)
    my_df["Match?"] = my_df.apply(get_distance, args=(bag1, bag2, dist), axis=1)
    print(my_df)


if __name__ == "__main__":
    main()
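
As a side note, 'args=(bag1, bag2, dist)' and a lambda are interchangeable ways of passing the extra arguments through apply, so that line could equally be written as:

my_df["Match?"] = my_df.apply(lambda row: get_distance(row, bag1, bag2, dist), axis=1)

With the sample data, every sentence contains a bag1 word within four tokens of a bag2 word, so the 'Match?' column should come out True for each exploded row.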