Source code for b4msa.lang_dependency

# -*- coding: utf-8 -*-

# Copyright 2016 Sabino Miranda-Jiménez and Daniela Moctezuma
# with collaborations of Eric S. Tellez

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import io
import re
import os
from nltk.stem.snowball import SnowballStemmer
from b4msa.params import OPTION_NONE
from nltk.stem.porter import PorterStemmer
idModule = "language_dependency"


PATH = os.path.join(os.path.dirname(__file__), 'resources')


_HASHTAG = '#'
_USERTAG = '@'
_sURL_TAG = '_url'
_sUSER_TAG = '_usr'
_sHASH_TAG = '_htag'
_sNUM_TAG = '_num'
_sDATE_TAG = '_date'
_sENTITY_TAG = '_ent'
_sNEGATIVE = "_neg"
_sPOSITIVE = "_pos"
_sNEUTRAL = "_neu"


def get_lang(l):
    """Convert a language abbreviation to its full name"""
    l = l.strip().lower()
    h = dict(es='spanish', en='english', ar='arabic',
             it='italian', de='german', zh='chinese')
    return h.get(l, l)
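
# A minimal usage sketch for get_lang (hypothetical session, not part of
# the module); unknown codes fall through unchanged:
#
# >>> get_lang('ES')
# 'spanish'
# >>> get_lang('french')
# 'french'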

class LangDependencyError(Exception):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return repr(self.message)

class LangDependency:
    """
    Defines a set of functions to change text using language-dependent
    transformations, e.g.,

    - Negation
    - Stemming
    - Stopwords

    :param lang: Language spanish | english | italian | german | arabic
    :type lang: str
    """
    STOPWORDS_CACHE = {}
    NEG_STOPWORDS_CACHE = {}

    def __init__(self, lang="spanish"):
        """
        Initializes the parameters for a specific language
        """
        self.languages = ["spanish", "english", "italian", "german", "arabic"]
        self.lang = lang
        if self.lang not in SnowballStemmer.languages and self.lang != 'chinese':
            raise LangDependencyError("Language not supported for stemming: " + lang)
        if self.lang == "english":
            self.stemmer = PorterStemmer()
        elif self.lang == 'chinese':
            self.stemmer = None
        else:
            self.stemmer = SnowballStemmer(self.lang)

    @property
    def lang(self):
        return self._lang

    @lang.setter
    def lang(self, l):
        self._lang = get_lang(l)

    @property
    def neg_stopwords(self):
        """Stop words to skip over when searching for a negation"""
        try:
            return self._neg_stopwords
        except AttributeError:
            lang = self.lang
            if self.lang not in self.languages:
                raise LangDependencyError("Language not supported: " + lang)
            self._neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
            if self._neg_stopwords is None:
                self._neg_stopwords = self.load_stopwords(
                    os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
                LangDependency.NEG_STOPWORDS_CACHE[lang] = self._neg_stopwords
            return self._neg_stopwords

    @property
    def stopwords(self):
        """Stop words read from the resources directory, indexed by length"""
        try:
            return self._stopwords
        except AttributeError:
            lang = self.lang
            if self.lang not in self.languages:
                raise LangDependencyError("Language not supported: " + lang)
            self._stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
            if self._stopwords is None:
                self._stopwords = self.load_stopwords(
                    os.path.join(PATH, "{0}.stopwords".format(lang)))
                LangDependency.STOPWORDS_CACHE[lang] = self._stopwords
            stw = dict()
            for x in self._stopwords:
                ident = len(x)
                try:
                    stw[ident][x] = 1
                except KeyError:
                    stw[ident] = {x: 1}
            self._stopwords = stw
            return self._stopwords
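
    # Layout sketch for the dictionary built by the `stopwords` property
    # (illustrative values, assuming a stop-word file containing "de" and
    # "la"): the flat list is re-indexed as {token_length: {token: 1}},
    # e.g. {2: {'de': 1, 'la': 1}}, so filterStopWords below can bucket
    # lookups by token length before testing membership.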

    def load_stopwords(self, fileName):
        """Load stopwords from file, skipping blank lines and comments"""
        if not os.path.isfile(fileName):
            raise LangDependencyError("File not found: " + fileName)
        StopWords = []
        with io.open(fileName, encoding='utf8') as f:
            for line in f.readlines():
                line = line.strip().lower()
                if line == "":
                    continue
                if line.startswith("#"):
                    continue
                StopWords.append(line)
        return StopWords

    def stemming(self, text):
        """Applies the stemming process to the `text` parameter"""
        tokens = re.split(r"~", text.strip())
        t = []
        for tok in tokens:
            if re.search(r"^(@|#|_|~)", tok, flags=re.I):
                t.append(tok)
            else:
                t.append(self.stemmer.stem(tok))
        return "~".join(t)
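
    # Hypothetical example of stemming on a '~'-joined text (Porter
    # stemmer, i.e. lang='english'); tokens starting with @, #, _ or ~
    # are passed through untouched:
    #
    # >>> LangDependency('english').stemming('running~_usr~quickly')
    # 'run~_usr~quickli'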

    def negation(self, text):
        """Applies the negation process to the given text"""
        if self.lang not in self.languages:
            raise LangDependencyError("Negation - language not defined")
        if self.lang == "spanish":
            text = self.spanish_negation(text)
        elif self.lang == "english":
            text = self.english_negation(text)
        elif self.lang == "italian":
            text = self.italian_negation(text)
        return text

    def spanish_negation(self, text):
        """
        Standardizes negation sentences; nouns are also considered through
        the operator "sin" (without).
        Markers like ninguno, ningún, and nadie are treated as regular words.
        """
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "me|te|se|lo|les|le|los"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # unifies negation markers under the "no" marker
        text = re.sub(r"\b(jam[aá]s|nunca|sin|ni|nada)\b", " no ", text, flags=re.I)
        # reduces repeated markers to a single negation marker
        text = re.sub(r"\b(jam[aá]s|nunca|sin|no|nada)(\s+\1)+", r"\1", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)no))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "no_" marks left over by the negation rules
        text = re.sub(r"\b(no_)\b", r" no ", text, flags=re.I)
        # removes extra spaces introduced by the transformations
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')
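
    # Illustrative sketch (hypothetical input/output, subject to the
    # stop-word lists shipped in resources): the "no" marker jumps over
    # clitics such as "me" and attaches to the following content word,
    # so "no~me~gusta" comes out roughly as "me~no_gusta".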

    def english_negation(self, text):
        """
        Standardizes negation sentences.
        Markers used: "not", "no", "never", "nor", "neither";
        "any" is only used in negative sentences.
        """
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "me|you|he|she|it|us|the"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # expands negation contractions
        text = re.sub(r"\b(ca)n't\b", r"\1n not ", text, flags=re.I)
        text = re.sub(r"\b(w)on't\b", r"\1ill not ", text, flags=re.I)
        text = re.sub(r"\b(sha)n't\b", r"\1ll not ", text, flags=re.I)
        text = re.sub(r"\b(can)not\b", r"\1 not ", text, flags=re.I)
        text = re.sub(r"\b([a-z]+)(n't)\b", r"\1 not ", text, flags=re.I)
        # detects negative sentences containing the "any" marker and
        # changes "any" to the "not" marker
        pp1 = re.compile(r"(?P<neg>(\bnot\b))(?P<text>(\s+([^\s]+?)\s+)+?)(?P<any>any\b)", flags=re.I)
        m = pp1.search(text)
        if m:
            text = pp1.sub(r"\g<neg> \g<text> not ", text)
        # unifies negation markers under the "not" marker
        # markers used: not, no, never, nor, neither
        text = re.sub(r"\b(not|no|never|nor|neither)\b", r" not ", text, flags=re.I)
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)not))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "not_" marks left over by the negation rules
        text = re.sub(r"\b(not_)\b", r" not ", text, flags=re.I)
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')
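
    # Illustrative sketch (hypothetical input/output): contractions are
    # expanded first and the marker then attaches to the following word,
    # so "I~don't~like~it" comes out roughly as "I~do~not_like~it".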

    def italian_negation(self, text):
        """Standardizes negation sentences for Italian, mirroring the Spanish rules"""
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "mi|ti|lo|gli|le|ne|li|glieli|glielo|gliela|gliene|gliele"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # unifies negation markers under the "no" marker
        text = re.sub(r"\b(mai|senza|non|no|né|ne)\b", " no ", text, flags=re.I)
        # reduces repeated markers to a single negation marker
        text = re.sub(r"\b(mai|senza|non|no|né|ne)(\s+\1)+", r"\1", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)no))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "no_" marks left over by the negation rules
        text = re.sub(r"\b(no_)\b", r" no ", text, flags=re.I)
        # removes extra spaces introduced by the transformations
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')

    def filterStopWords(self, text, stopwords_option):
        """Deletes or groups stop words according to `stopwords_option`"""
        if stopwords_option == OPTION_NONE:
            return text
        sw = self.stopwords
        d = text.split('~')
        R = []
        for x in d:
            try:
                if x in sw[len(x)]:
                    if stopwords_option == 'delete':
                        continue
                    elif stopwords_option == 'group':
                        x = "_sw"
            except KeyError:
                pass
            R.append(x)
        return "~".join(R)

    def transform(self, text, negation=False, stemming=False, stopwords=OPTION_NONE):
        """Applies negation, stemming, and stop-word filtering, in that order"""
        if negation:
            text = self.negation(text)
        if stemming:
            text = self.stemming(text)
        text = self.filterStopWords(text, stopwords)
        return text
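
# Minimal end-to-end sketch (hypothetical session; the exact output depends
# on the stop-word resources shipped with the package):
#
# >>> ld = LangDependency('english')
# >>> ld.transform("I~don't~like~it", negation=True, stemming=True,
# ...              stopwords=OPTION_NONE)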