# Source code for b4msa.classifier

# Copyright 2016 Ranyart R. Suarez (https://github.com/RanyartRodrigo) and Mario Graff (https://github.com/mgraffg)
# with collaborations of Eric S. Tellez

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sklearn.svm import LinearSVC
# from b4msa.textmodel import TextModel
import numpy as np
from microtc.utils import read_data_labels, read_data, tweet_iterator
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from b4msa.textmodel import TextModel
from multiprocessing import Pool
from scipy.sparse import csr_matrix


class SVC(object):
    """Linear SVM classifier working on the sparse vectors of a text model.

    The instance wraps a :class:`sklearn.svm.LinearSVC` and a text model;
    ``model[text]`` is expected to return the sparse representation of
    ``text`` as an iterable of ``(term_index, weight)`` pairs.

    :param model: TextModel
    :type model: class
    :param kwargs: Keyword arguments forwarded to ``LinearSVC``

    Usage:

    >>> from b4msa.textmodel import TextModel
    >>> from b4msa.classifier import SVC
    >>> corpus = ['buenos dias', 'catedras conacyt', 'categorizacion de texto ingeotec']
    >>> textmodel = TextModel(corpus)
    >>> svc = SVC(textmodel)
    >>> _ = svc.fit([textmodel[x] for x in corpus], [1, 0, 0])
    >>> svc.predict_text('hola')
    0
    """
    def __init__(self, model, **kwargs):
        self.svc = LinearSVC(**kwargs)
        self.model = model
        # Set by the first call to ``tonp`` (normally through ``fit``).
        self._num_terms = None

    @property
    def num_terms(self):
        """Dimension which is the number of terms of the corpus

        ``None`` until :meth:`tonp` has been called for the first time.

        :rtype: int
        """

        # getattr keeps the property usable on instances that were created
        # without running __init__ (e.g. restored from an old pickle).
        return getattr(self, '_num_terms', None)

    def tonp(self, X):
        """Sparse representation to sparse matrix

        Non-finite weights are dropped.  The first call fixes
        :attr:`num_terms` to the largest term index seen plus one (``0`` when
        there are no entries at all); on later calls, entries whose index is
        not smaller than :attr:`num_terms` are dropped as well.

        :param X: Sparse representation of matrix, one iterable of
                  ``(term_index, weight)`` pairs per row
        :type X: list
        :rtype: csr_matrix
        """

        num_terms = self.num_terms
        data = []
        row = []
        col = []
        n_rows = 0
        for r, x in enumerate(X):
            n_rows = r + 1
            # Single pass over each row so that index and weight lists can
            # never get out of step.
            for item in x:
                index, weight = item[0], item[1]
                if not np.isfinite(weight):
                    continue
                if num_terms is not None and index >= num_terms:
                    continue
                row.append(r)
                col.append(index)
                data.append(weight)
        if num_terms is None:
            num_terms = int(max(col)) + 1 if col else 0
            self._num_terms = num_terms
        # The shape is always given explicitly: letting csr_matrix infer it
        # drops trailing empty rows and fails when there are no entries.
        return csr_matrix((data, (row, col)), shape=(n_rows, num_terms))

    def fit(self, X, y):
        """Train the classifier

        When the training vectors contain no terms at all
        (``num_terms == 0``) only the label encoder is fitted and the SVM is
        left untrained.

        :param X: inputs - independent variables
        :type X: lst
        :param y: output - dependent variable

        :rtype: instance
        """

        X = self.tonp(X)
        self.le = preprocessing.LabelEncoder()
        self.le.fit(y)
        y = self.le.transform(y)
        if self.num_terms == 0:
            return self
        self.svc.fit(X, y)
        return self

    def decision_function(self, Xnew):
        """Decision values of the underlying ``LinearSVC``

        :param Xnew: Sparse representation of the inputs
        :type Xnew: list
        :returns: Output of ``LinearSVC.decision_function``
        """

        Xnew = self.tonp(Xnew)
        return self.svc.decision_function(Xnew)

    def predict(self, Xnew):
        """Predict the label of each input

        If the classifier was fitted without any term, the first encoded
        class is returned for every input.

        :param Xnew: Sparse representation of the inputs
        :type Xnew: list
        :returns: Predicted labels, in the original label space
        """

        if self.num_terms == 0:
            # ``np.int`` was removed from NumPy; the builtin ``int`` is the
            # equivalent dtype.
            return self.le.inverse_transform(np.zeros(len(Xnew), dtype=int))
        Xnew = self.tonp(Xnew)
        ynew = self.svc.predict(Xnew)
        return self.le.inverse_transform(ynew)

    def predict_text(self, text):
        """Predict the label of a single text

        :param text: Input passed to ``self.model[...]``
        :returns: Predicted label
        """

        y = self.predict([self.model[text]])
        return y[0]

    def fit_file(self, fname, get_tweet='text',
                 get_klass='klass', maxitems=1e100):
        """Train the classifier with the data stored in a file

        The file is read with ``microtc.utils.read_data_labels``.

        :param fname: File name
        :param get_tweet: Forwarded to ``read_data_labels``
        :param get_klass: Forwarded to ``read_data_labels``
        :param maxitems: Forwarded to ``read_data_labels``
        :rtype: instance
        """

        X, y = read_data_labels(fname, get_klass=get_klass,
                                get_tweet=get_tweet, maxitems=maxitems)
        self.fit([self.model[x] for x in X], y)
        return self

    def predict_file(self, fname, get_tweet='text', maxitems=1e100):
        """Predict the label of every item stored in a file

        The file is read with ``microtc.utils.read_data``.

        :param fname: File name
        :param get_tweet: Forwarded to ``read_data``
        :param maxitems: Forwarded to ``read_data``
        :rtype: list
        """

        hy = [self.predict_text(x)
              for x in read_data(fname, get_tweet=get_tweet,
                                 maxitems=maxitems)]
        return hy

    @classmethod
    def predict_kfold(cls, X, y, n_folds=10, seed=0, textModel_params=None,
                      kfolds=None, pool=None, use_tqdm=True):
        """Out-of-fold predictions obtained with k-fold cross-validation

        :param X: Inputs, indexable by integer position
        :param y: Labels
        :param n_folds: Number of folds, used only when ``kfolds`` is None
        :type n_folds: int
        :param seed: Random state of the ``StratifiedKFold`` created when
                     ``kfolds`` is None
        :param textModel_params: Keyword arguments for ``TextModel``
                                 (``None`` means no arguments)
        :type textModel_params: dict
        :param kfolds: Iterable of ``(train_indices, test_indices)`` pairs
        :param pool: Object providing ``imap_unordered`` used to process the
                     folds; ``None`` processes them sequentially
        :param use_tqdm: Show a progress bar when ``tqdm`` is installed
        :type use_tqdm: bool
        :returns: Predicted label for each element of ``X``
        """

        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(x, **kwargs):
                return x

        if textModel_params is None:
            textModel_params = {}
        le = preprocessing.LabelEncoder().fit(y)
        y = np.array(le.transform(y))
        hy = np.zeros(len(y), dtype=int)
        if kfolds is None:
            kfolds = StratifiedKFold(n_splits=n_folds, shuffle=True,
                                     random_state=seed).split(X, y)
        args = [(X, y, tr, ts, textModel_params) for tr, ts in kfolds]
        if pool is not None:
            res = pool.imap_unordered(cls.train_predict_pool, args)
            if use_tqdm:
                res = tqdm(res, desc='Params', total=len(args))
            res = list(res)
        else:
            if use_tqdm:
                args = tqdm(args)
            res = [cls.train_predict_pool(x) for x in args]
        # Each result carries its own test indices, so the (possibly
        # unordered) fold results can be written back in any order.
        for ts, _hy in res:
            hy[ts] = _hy
        return le.inverse_transform(hy)

    @classmethod
    def train_predict_pool(cls, args):
        """Train on one fold and predict its test partition

        :param args: Tuple ``(X, y, tr, ts, textModel_params)`` where ``tr``
                     and ``ts`` are the train and test indices; the keys of
                     ``textModel_params`` not listed in
                     ``TextModel.params()`` are ignored
        :type args: tuple
        :returns: ``(ts, predictions)``
        :rtype: tuple
        """

        X, y, tr, ts, textModel_params = args
        params = TextModel.params()
        textModel_params = {k: v for k, v in textModel_params.items() if k in params}
        t = TextModel([X[x] for x in tr], **textModel_params)
        m = cls(t).fit([t[X[x]] for x in tr], [y[x] for x in tr])
        return ts, np.array(m.predict([t[X[x]] for x in ts]))

    @classmethod
    def predict_kfold_params(cls, fname, n_folds=10, score=None, numprocs=None, seed=0, param_kwargs=None):
        """Search the text-model parameters using k-fold cross-validation

        :param fname: File read with ``microtc.utils.read_data_labels``
        :param n_folds: Number of folds
        :type n_folds: int
        :param score: Forwarded to ``b4msa.params.Wrapper``
        :param numprocs: Number of worker processes; ``None`` runs without a
                         process pool
        :param seed: Forwarded to ``b4msa.params.Wrapper``
        :param param_kwargs: Keyword arguments for
                             ``ParameterSelection.search`` (``None`` means no
                             arguments)
        :type param_kwargs: dict
        :returns: Output of ``ParameterSelection.search``
        """

        from b4msa.params import ParameterSelection, Wrapper
        X, y = read_data_labels(fname)
        if param_kwargs is None:
            param_kwargs = {}
        if numprocs is not None:
            workers = Pool(numprocs)
        else:
            workers = None
            numprocs = 1

        try:
            if n_folds % numprocs == 0:
                # The folds split evenly among the workers: the pool is
                # handed to the wrapper and the search itself gets none.
                f = Wrapper(X, y, score, n_folds, cls, pool=workers, seed=seed)
                search_pool = None
            else:
                f = Wrapper(X, y, score, n_folds, cls, seed=seed)
                search_pool = workers
            return ParameterSelection().search(f.f, pool=search_pool, **param_kwargs)
        finally:
            # Release the worker processes created above.
            if workers is not None:
                workers.close()
                workers.join()

    @classmethod
    def fit_from_file(cls, fname, textModel_params=None):
        """Create and train a classifier with the data stored in a file

        The items are read with ``microtc.utils.tweet_iterator`` and the
        label is taken from the ``'klass'`` key of each item.

        :param fname: File name
        :param textModel_params: Keyword arguments for ``TextModel``
                                 (``None`` means no arguments)
        :type textModel_params: dict
        :rtype: instance
        """

        if textModel_params is None:
            textModel_params = {}
        D = list(tweet_iterator(fname))
        y = [x['klass'] for x in D]
        model = TextModel(D, **textModel_params)
        svc = cls(model)
        return svc.fit([model[x] for x in D], y)