Source code for acg.language_processing

"""Language processing."""

import re
import string

from kivymd.app import MDApp


def get_nlp(language):
    """Load the spaCy model ``language``, downloading it first if necessary.

    Args:
      language: Name of the spaCy model, as accepted by ``spacy.load``.

    Returns:
        : Whatever ``spacy.load(language)`` returns.
    """
    # Imported inside the function rather than at module level.
    import spacy  # pylint: disable=import-outside-toplevel

    already_installed = language in spacy.info()["Models"]
    if not already_installed:
        spacy.cli.download(language)
    return spacy.load(language)


# Module-level pipeline shared by the functions below; it stays ``None``
# whenever the default model cannot be loaded.
NLP = None
try:
    NLP = get_nlp("pt")
except:  # pylint: disable=bare-except
    # NOTE(review): the bare except is kept on purpose as a best-effort
    # fallback; presumably it also has to survive non-``Exception`` errors
    # raised while downloading the model -- confirm before narrowing it.
    print("COULD NOT FIND SPACY MODEL.")


def remove_punctuation(some_string):
    """Return ``some_string`` with leading/trailing punctuation and whitespace removed.

    Only the two ends of the string are stripped; characters in the middle
    are left untouched. Besides ASCII punctuation and whitespace, the closing
    curly quote (”) is stripped as well.
    """
    strippable = "".join((string.punctuation, string.whitespace, "”"))
    return some_string.strip(strippable)


def join_lemmas(doc):
    """Return the lemmas of ``doc`` concatenated with their original spacing.

    Args:
      doc: Iterable of tokens exposing ``lemma_`` and ``whitespace_``.

    Returns:
        : One string in which every lemma is followed by its token's
          trailing whitespace.
    """
    pieces = []
    for token in doc:
        # Pair each lemma with the whitespace that followed the token.
        pieces.append(token.lemma_ + token.whitespace_)
    return "".join(pieces)


def lemma_dict(phrases):
    """Return a dictionary mapping each original phrase to its lemmatized form.

    When no pipeline is loaded (``NLP`` is falsy) every phrase maps to itself.
    Otherwise, if the running app has a truthy ``target_language`` that
    differs from ``NLP.lang``, the module-level ``NLP`` is replaced with the
    pipeline for that language before the phrases are processed.

    Args:
      phrases: Iterable of phrases to lemmatize.

    Returns:
        : Dictionary ``{original_phrase: lemmatized_phrase}``.
    """
    global NLP  # pylint: disable=global-statement
    if NLP:
        app = MDApp.get_running_app()
        wanted = getattr(app, "target_language", None)
        if wanted and NLP.lang != wanted:
            # Swap the shared pipeline to the app's target language.
            NLP = get_nlp(wanted)
        return {text: join_lemmas(NLP(text)) for text in phrases}
    # No pipeline available: fall back to the identity mapping.
    return {text: text for text in phrases}


def clean_up(words, remove_punct=True, lower_case=True, lemmatize=True):
    """
    Preprocess a list of words (or phrases).

    The enabled steps run in this order: punctuation stripping, lower-casing,
    lemmatization. Each step yields exactly one entry per input word, in input
    order, so the result can be zipped with the input.

    Args:
      words: List of words
      remove_punct: If True, removes trailing and leading punctuation. (Default value = True)
      lower_case: If True, converts everything to lower case. (Default value = True)
      lemmatize: If True, tries to convert each word to its dictionary-form. (Default value = True)

    Returns:
        : List of processed words (or phrases).
    """
    if remove_punct:
        words = [word.strip(",.;:-–—!?¿¡\"'") for word in words]
    if lower_case:
        words = [word.lower() for word in words]
    if lemmatize:
        # Materialize first: the words are iterated twice below.
        words = list(words)
        # lemma_dict() is keyed by word, so duplicates collapse into a single
        # entry there; look every word up individually to keep the output
        # aligned with the input instead of returning the dict's values.
        lemma_by_word = lemma_dict(words)
        words = [lemma_by_word[word] for word in words]
    return words


def tag_word_in_sentence(sentence, tag_word):
    """
    Use regex to wrap every derived form of a given ``tag_word`` in ``sentence`` in an html-tag.

    A word of the sentence counts as a derived form when its lemma equals the
    lemma of ``tag_word`` or when the cleaned word equals ``tag_word`` itself.
    Every occurrence is wrapped in ``<span class="word">…</span>``; matching
    is case-insensitive.

    Args:
      sentence: String containing of multiple words.
      tag_word: Word that should be wrapped.

    Returns:
        : Sentence with replacements.
    """
    words = clean_up(sentence.split(), lemmatize=False)
    # get unique, non-empty strings:
    words = [word for word in set(words) if word]
    lemmas = clean_up(words, lemmatize=True)
    tag_lemma = clean_up([tag_word])[0]
    words_found = [
        word
        for word, lemma in zip(words, lemmas)
        if lemma == tag_lemma or word == tag_word
    ]
    if not words_found:
        return sentence

    # Longest alternatives first so that e.g. "casas" wins over "casa";
    # the secondary key makes the pattern independent of set ordering.
    ordered = sorted(words_found, key=lambda word: (-len(word), word))
    # Escape each word: it is literal text, not a regular expression.
    alternatives = "|".join(re.escape(word) for word in ordered)
    # Lookarounds instead of consuming groups, so that matches at the very
    # start/end of the sentence and directly adjacent matches are found.
    # ">" / "<" keep text that is already wrapped in a tag from being wrapped
    # again; "\w" prevents tagging a fragment inside a longer word.
    pattern = rf"(?<![\w>])(?:{alternatives})(?![\w<])"
    # A single pass, so markup inserted for one word is never re-scanned
    # while looking for another.
    return re.sub(
        pattern,
        r'<span class="word">\g<0></span>',
        sentence,
        flags=re.IGNORECASE,
    )