Source code for resumeanalyser.text_cleaning

import nltk
import re
import os


# Fetch the NLTK resources this module asks for ("wordnet", "stopwords" and
# "punkt"), then import the NLTK helpers used by the functions below.
try:
    nltk.download("wordnet", quiet=True)
    nltk.download("stopwords", quiet=True)
    nltk.download("punkt", quiet=True)

    from nltk.corpus import wordnet
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize

except Exception as e:
    # Best-effort setup: any failure is printed rather than re-raised, so the
    # module still imports. If one of the imports above did not run, its name
    # stays undefined and the functions using it raise NameError when called.
    print(f"An error occurred: {e}")



[docs]
def remove_punctuation(text):
    """
    Remove punctuation and special characters from the input text.

    Every character that is not an ASCII letter (A-Z, a-z) is treated as a
    separator. That covers punctuation, digits, whitespace and non-ASCII
    characters. Each run of separators becomes a single space, and
    separators at the very start or end of the text are dropped.

    Parameters:
    text (str): A string containing the text to be processed.

    Returns:
    str: The remaining words joined by single spaces, with no leading or
    trailing space. An input without any ASCII letters yields ''.

    Example:
    >>> remove_punctuation("Hello, world!")
    'Hello world'
    >>> remove_punctuation("C3PO")
    'C PO'
    """
    # One pass: a whole run of non-letters collapses to a single space.
    cleaned_text = re.sub(r'[^A-Za-z]+', ' ', text)

    # A separator at either end of the input would otherwise leave a stray
    # space behind (e.g. the '!' in "Hello, world!").
    return cleaned_text.strip(' ')




[docs]
def tokenize(text):
    """
    Split the input text into tokens with NLTK's ``word_tokenize``.

    Parameters:
    text (str): The text to split.

    Returns:
    list: The tokens produced by ``word_tokenize``, in their original order.
    Punctuation marks come back as separate tokens.

    Example:
    >>> tokenize("Hello, world!")
    ['Hello', ',', 'world', '!']
    """
    tokens = word_tokenize(text)
    return tokens




[docs]
def to_lower(tokens):
    """
    Return a lowercase copy of every token.

    Parameters:
    tokens (list): The tokens (words) to convert.

    Returns:
    list: A new list with each token lowercased, in the same order. The
    input list is left unmodified.

    Example:
    >>> to_lower(['Hello', 'WORLD'])
    ['hello', 'world']
    """
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered




[docs]
def remove_stop_words(tokens):
    """
    Drop every token that appears in NLTK's English stop-word list.

    The comparison is an exact string match, so tokens are only filtered
    when their casing matches the entries in the stop-word list.

    Parameters:
    tokens (list): The tokens (words) to filter.

    Returns:
    list: The tokens that are not stop words, in their original order.

    Example:
    >>> remove_stop_words(['this', 'is', 'a', 'sample'])
    ['sample']
    """
    # A set gives constant-time membership checks for the filter below.
    english_stops = set(stopwords.words('english'))

    kept = []
    for word in tokens:
        if word in english_stops:
            continue
        kept.append(word)
    return kept




[docs]
def lemmatize(tokens):
    """
    Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    No part-of-speech tag is passed, so the lemmatizer's default is used
    for every token.

    Parameters:
    tokens (list): The tokens (words) to lemmatize.

    Returns:
    list: The lemmatized tokens, in the same order as the input.

    Example:
    >>> lemmatize(['running', 'jumps'])
    ['running', 'jump']
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for word in tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return lemmas




[docs]
def clean_text(text):
    """
    Run the full cleaning pipeline on a piece of text.

    The steps are applied in this order: punctuation removal, tokenization,
    lowercasing, stop-word removal and lemmatization. The surviving tokens
    are then joined with single spaces.

    Parameters:
    text (str): The text to be cleaned.

    Returns:
    str: The cleaned text as a single space-separated string.

    Example:
    >>> clean_text("This is a sample sentence, showing off the stop words filtration.")
    'sample sentence showing stop word filtration'
    """
    words = tokenize(remove_punctuation(text))

    # Each remaining stage maps a token list to a token list, so they can be
    # applied one after another in pipeline order.
    for stage in (to_lower, remove_stop_words, lemmatize):
        words = stage(words)

    return ' '.join(words)