Source code for b4msa.lang_dependency

# -*- coding: utf-8 -*-

# Copyright 2016 Sabino Miranda-Jiménez and Daniela Moctezuma
# with collaborations of Eric S. Tellez

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import io
import re
import os
from nltk.stem.snowball import SnowballStemmer
from b4msa.params import OPTION_NONE
from nltk.stem.porter import PorterStemmer
idModule = "language_dependency"


PATH = os.path.join(os.path.dirname(__file__), 'resources')


_HASHTAG = '#'
_USERTAG = '@'
_sURL_TAG = '_url'
_sUSER_TAG = '_usr'
_sHASH_TAG = '_htag'
_sNUM_TAG = '_num'
_sDATE_TAG = '_date'
_sENTITY_TAG = '_ent'
_sNEGATIVE = "_neg"
_sPOSITIVE = "_pos"
_sNEUTRAL = "_neu"


def get_lang(l):
    """Convert a language abbreviation to its full name"""
    l = l.strip().lower()
    h = dict(es='spanish', en='english', ar='arabic',
             it='italian', de='german', zh='chinese')
    return h.get(l, l)
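
# A minimal usage sketch for get_lang (hypothetical session, not part of
# the module); unknown codes fall through unchanged:
#
# >>> get_lang('ES')
# 'spanish'
# >>> get_lang('french')
# 'french'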

class LangDependencyError(Exception):
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return repr(self.message)

class LangDependency:
    """
    Defines a set of functions to change text using language-dependent
    transformations, e.g.,

    - Negation
    - Stemming
    - Stopwords

    :param lang: Language spanish | english | italian | german | arabic
    :type lang: str
    """
    STOPWORDS_CACHE = {}
    NEG_STOPWORDS_CACHE = {}

    def __init__(self, lang="spanish"):
        """
        Initializes the parameters for a specific language
        """
        self.languages = ["spanish", "english", "italian", "german", "arabic"]
        self.lang = lang
        if self.lang not in SnowballStemmer.languages and self.lang != 'chinese':
            raise LangDependencyError("Language not supported for stemming: " + lang)
        if self.lang == "english":
            self.stemmer = PorterStemmer()
        elif self.lang == 'chinese':
            self.stemmer = None
        else:
            self.stemmer = SnowballStemmer(self.lang)

    @property
    def lang(self):
        return self._lang

    @lang.setter
    def lang(self, l):
        self._lang = get_lang(l)

    @property
    def neg_stopwords(self):
        """Stop words to skip over when searching for a negation"""
        try:
            return self._neg_stopwords
        except AttributeError:
            lang = self.lang
            if self.lang not in self.languages:
                raise LangDependencyError("Language not supported: " + lang)
            self._neg_stopwords = LangDependency.NEG_STOPWORDS_CACHE.get(lang, None)
            if self._neg_stopwords is None:
                self._neg_stopwords = self.load_stopwords(
                    os.path.join(PATH, "{0}.neg.stopwords".format(lang)))
                LangDependency.NEG_STOPWORDS_CACHE[lang] = self._neg_stopwords
            return self._neg_stopwords

    @property
    def stopwords(self):
        """Stop words read from the resources directory, indexed by length"""
        try:
            return self._stopwords
        except AttributeError:
            lang = self.lang
            if self.lang not in self.languages:
                raise LangDependencyError("Language not supported: " + lang)
            self._stopwords = LangDependency.STOPWORDS_CACHE.get(lang, None)
            if self._stopwords is None:
                self._stopwords = self.load_stopwords(
                    os.path.join(PATH, "{0}.stopwords".format(lang)))
                LangDependency.STOPWORDS_CACHE[lang] = self._stopwords
            stw = dict()
            for x in self._stopwords:
                ident = len(x)
                try:
                    stw[ident][x] = 1
                except KeyError:
                    stw[ident] = {x: 1}
            self._stopwords = stw
            return self._stopwords
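
    # Layout sketch for the dictionary built by the `stopwords` property
    # (illustrative values, assuming a stop-word file containing "de" and
    # "la"): the flat list is re-indexed as {token_length: {token: 1}},
    # e.g. {2: {'de': 1, 'la': 1}}, so filterStopWords below can bucket
    # lookups by token length before testing membership.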

    def load_stopwords(self, fileName):
        """Load stopwords from file, skipping blank lines and comments"""
        if not os.path.isfile(fileName):
            raise LangDependencyError("File not found: " + fileName)
        StopWords = []
        with io.open(fileName, encoding='utf8') as f:
            for line in f.readlines():
                line = line.strip().lower()
                if line == "":
                    continue
                if line.startswith("#"):
                    continue
                StopWords.append(line)
        return StopWords

    def stemming(self, text):
        """Applies the stemming process to the `text` parameter"""
        tokens = re.split(r"~", text.strip())
        t = []
        for tok in tokens:
            if re.search(r"^(@|#|_|~)", tok, flags=re.I):
                t.append(tok)
            else:
                t.append(self.stemmer.stem(tok))
        return "~".join(t)
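
    # Hypothetical example of stemming on a '~'-joined text (Porter
    # stemmer, i.e. lang='english'); tokens starting with @, #, _ or ~
    # are passed through untouched:
    #
    # >>> LangDependency('english').stemming('running~_usr~quickly')
    # 'run~_usr~quickli'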

    def negation(self, text):
        """Applies the negation process to the given text"""
        if self.lang not in self.languages:
            raise LangDependencyError("Negation - language not defined")
        if self.lang == "spanish":
            text = self.spanish_negation(text)
        elif self.lang == "english":
            text = self.english_negation(text)
        elif self.lang == "italian":
            text = self.italian_negation(text)
        return text

    def spanish_negation(self, text):
        """
        Standardizes negation sentences; nouns are also considered through
        the operator "sin" (without).
        Markers like ninguno, ningún, and nadie are treated as regular words.
        """
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "me|te|se|lo|les|le|los"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # unifies negation markers under the "no" marker
        text = re.sub(r"\b(jam[aá]s|nunca|sin|ni|nada)\b", " no ", text, flags=re.I)
        # reduces repeated markers to a single negation marker
        text = re.sub(r"\b(jam[aá]s|nunca|sin|no|nada)(\s+\1)+", r"\1", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)no))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "no_" marks left over by the negation rules
        text = re.sub(r"\b(no_)\b", r" no ", text, flags=re.I)
        # removes extra spaces introduced by the transformations
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')
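
    # Illustrative sketch (hypothetical input/output, subject to the
    # stop-word lists shipped in resources): the "no" marker jumps over
    # clitics such as "me" and attaches to the following content word,
    # so "no~me~gusta" comes out roughly as "me~no_gusta".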

    def english_negation(self, text):
        """
        Standardizes negation sentences.
        Markers used: "not", "no", "never", "nor", "neither";
        "any" is only used in negative sentences.
        """
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "me|you|he|she|it|us|the"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # expands negation contractions
        text = re.sub(r"\b(ca)n't\b", r"\1n not ", text, flags=re.I)
        text = re.sub(r"\b(w)on't\b", r"\1ill not ", text, flags=re.I)
        text = re.sub(r"\b(sha)n't\b", r"\1ll not ", text, flags=re.I)
        text = re.sub(r"\b(can)not\b", r"\1 not ", text, flags=re.I)
        text = re.sub(r"\b([a-z]+)(n't)\b", r"\1 not ", text, flags=re.I)
        # detects negative sentences containing the "any" marker and
        # changes "any" to the "not" marker
        pp1 = re.compile(r"(?P<neg>(\bnot\b))(?P<text>(\s+([^\s]+?)\s+)+?)(?P<any>any\b)", flags=re.I)
        m = pp1.search(text)
        if m:
            text = pp1.sub(r"\g<neg> \g<text> not ", text)
        # unifies negation markers under the "not" marker
        # markers used: not, no, never, nor, neither
        text = re.sub(r"\b(not|no|never|nor|neither)\b", r" not ", text, flags=re.I)
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)not))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "not_" marks left over by the negation rules
        text = re.sub(r"\b(not_)\b", r" not ", text, flags=re.I)
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')
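
    # Illustrative sketch (hypothetical input/output): contractions are
    # expanded first and the marker then attaches to the following word,
    # so "I~don't~like~it" comes out roughly as "I~do~not_like~it".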

    def italian_negation(self, text):
        """Standardizes negation sentences for Italian, mirroring the Spanish rules"""
        if getattr(self, 'skip_words', None) is None:
            self.skip_words = "mi|ti|lo|gli|le|ne|li|glieli|glielo|gliela|gliene|gliele"
            self.skip_words = self.skip_words + "|" + "|".join(self.neg_stopwords)
        text = text.replace('~', ' ')
        tags = _sURL_TAG + "|" + _sUSER_TAG + "|" + _sENTITY_TAG + "|" + \
            _sHASH_TAG + "|" + \
            _sNUM_TAG + "|" + _sNEGATIVE + "|" + \
            _sPOSITIVE + "|" + _sNEUTRAL + "|"
        # unifies negation markers under the "no" marker
        text = re.sub(r"\b(mai|senza|non|no|né|ne)\b", " no ", text, flags=re.I)
        # reduces repeated markers to a single negation marker
        text = re.sub(r"\b(mai|senza|non|no|né|ne)(\s+\1)+", r"\1", text, flags=re.I)
        p1 = re.compile(r"(?P<neg>((\s+|\b|^)no))(?P<sk_words>(\s+(" +
                        self.skip_words + "|" + tags +
                        r"))*)\s+(?P<text>(?!(" + tags + r")(\s+|\b|$)))",
                        flags=re.I)
        m = p1.search(text)
        if m:
            text = p1.sub(r"\g<sk_words> \g<neg>_\g<text>", text)
        # removes isolated "no_" marks left over by the negation rules
        text = re.sub(r"\b(no_)\b", r" no ", text, flags=re.I)
        # removes extra spaces introduced by the transformations
        text = re.sub(r"\s+", r" ", text, flags=re.I)
        return text.replace(' ', '~')

    def filterStopWords(self, text, stopwords_option):
        """Deletes or groups stop words according to `stopwords_option`"""
        if stopwords_option == OPTION_NONE:
            return text
        sw = self.stopwords
        d = text.split('~')
        R = []
        for x in d:
            try:
                if x in sw[len(x)]:
                    if stopwords_option == 'delete':
                        continue
                    elif stopwords_option == 'group':
                        x = "_sw"
            except KeyError:
                pass
            R.append(x)
        return "~".join(R)

    def transform(self, text, negation=False, stemming=False, stopwords=OPTION_NONE):
        """Applies negation, stemming, and stop-word filtering, in that order"""
        if negation:
            text = self.negation(text)
        if stemming:
            text = self.stemming(text)
        text = self.filterStopWords(text, stopwords)
        return text
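
# Minimal end-to-end sketch (hypothetical session; the exact output depends
# on the stop-word resources shipped with the package):
#
# >>> ld = LangDependency('english')
# >>> ld.transform("I~don't~like~it", negation=True, stemming=True,
# ...              stopwords=OPTION_NONE)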