# Source of eli5.sklearn.unhashing (reconstructed from ELI5 0.15.0 documentation)

""" Utilities to reverse transformation done by FeatureHasher or HashingVectorizer. """ from collections import defaultdict, Counter from itertools import chain from typing import Iterable, Union

import numpy as np from sklearn.base import BaseEstimator, TransformerMixin from sklearn.feature_extraction.text import ( HashingVectorizer, FeatureHasher, ) from sklearn.pipeline import FeatureUnion

from eli5._feature_names import FeatureNames

class InvertableHashingVectorizer(BaseEstimator, TransformerMixin):
    """
    A wrapper for HashingVectorizer which allows to get meaningful
    feature names. Create it with an existing HashingVectorizer
    instance as an argument::

        vec = InvertableHashingVectorizer(my_hashing_vectorizer)

    Unlike HashingVectorizer it can be fit. During fitting
    :class:`~.InvertableHashingVectorizer` learns which input terms map to
    which feature columns/signs; this allows to provide more meaningful
    :meth:`get_feature_names_out`. The cost is that it is no longer stateless.

    You can fit :class:`~.InvertableHashingVectorizer` on a random sample
    of documents (not necessarily on the whole training and testing data),
    and use it to inspect an existing HashingVectorizer instance.

    If several features hash to the same value, they are ordered by
    their frequency in documents that were used to fit the vectorizer.

    :meth:`transform` works the same as HashingVectorizer.transform.
    """
    def __init__(self, vec: HashingVectorizer, unkn_template="FEATURE[%d]"):
        self.vec = vec
        self.unkn_template = unkn_template
        # The unhasher does the actual term -> column/sign bookkeeping.
        self.unhasher = FeatureUnhasher(
            hasher=vec._get_hasher(),
            unkn_template=unkn_template,
        )
        self.n_features: int = vec.n_features

    def fit(self, X, y=None):
        """ Extract possible terms from documents """
        self.unhasher.fit(self._get_terms_iter(X))
        return self

    def partial_fit(self, X):
        # Incremental variant of fit: accumulates term statistics.
        self.unhasher.partial_fit(self._get_terms_iter(X))
        return self

    def transform(self, X):
        # Delegate to the wrapped HashingVectorizer unchanged.
        return self.vec.transform(X)

    def get_feature_names_out(self, always_signed=True) -> FeatureNames:
        """
        Return feature names. This is a best-effort function which tries to
        reconstruct feature names based on what it has seen so far.

        HashingVectorizer uses a signed hash function. If always_signed is True,
        each term in feature names is prepended with its sign. If it is False,
        signs are only shown in case of possible collisions of different sign.

        You probably want always_signed=True if you're checking
        unprocessed classifier coefficients, and always_signed=False
        if you've taken care of :attr:`column_signs_`.
        """
        return self.unhasher.get_feature_names_out(
            always_signed=always_signed,
            always_positive=self._always_positive(),
        )

    def _get_terms_iter(self, X):
        # Tokenize documents exactly as the wrapped vectorizer would.
        analyze = self.vec.build_analyzer()
        return chain.from_iterable(analyze(doc) for doc in X)

    @property
    def column_signs_(self):
        """
        Return a numpy array with expected signs of features.
        Values are

        * +1 when all known terms which map to the column have positive sign;
        * -1 when all known terms which map to the column have negative sign;
        * ``nan`` when there are both positive and negative known terms
          for this column, or when there is no known term which maps to this
          column.
        """
        if self._always_positive():
            return np.ones(self.n_features)
        self.unhasher.recalculate_attributes()
        return self.unhasher.column_signs_

    def _always_positive(self) -> bool:
        # Signs are all positive when the vectorizer never alternates sign:
        # binary output, legacy non_negative=True, or alternate_sign=False.
        return (
            self.vec.binary
            or getattr(self.vec, 'non_negative', False)
            or not getattr(self.vec, 'alternate_sign', True)
        )

class FeatureUnhasher(BaseEstimator):
    """
    Class for recovering a mapping used by FeatureHasher.
    """
    def __init__(self, hasher: FeatureHasher, unkn_template="FEATURE[%d]"):
        if hasher.input_type != 'string':
            raise ValueError("FeatureUnhasher only supports hashers with "
                             "input_type 'string', got %r." % hasher.input_type)
        self.hasher = hasher
        self.n_features: int = self.hasher.n_features
        self.unkn_template = unkn_template
        # Computed attributes (terms_, collisions_, ...) are rebuilt lazily;
        # this flag marks them as stale after new terms are seen.
        self._attributes_dirty = True
        # Term -> document frequency; frequency orders colliding terms.
        self._term_counts: Counter[str] = Counter()

    def fit(self, X: Iterable[str], y=None) -> 'FeatureUnhasher':
        """ Learn terms from an iterable of strings, replacing prior state. """
        self._term_counts.clear()
        self.partial_fit(X, y)
        self.recalculate_attributes(force=True)
        return self

    def partial_fit(self, X: Iterable[str], y=None) -> 'FeatureUnhasher':
        """ Accumulate terms from an iterable of strings. """
        self._term_counts.update(X)
        self._attributes_dirty = True
        return self

    def get_feature_names_out(self, always_signed=True, always_positive=False) -> FeatureNames:
        """ Return a FeatureNames mapping column ids to known terms. """
        self.recalculate_attributes()

        # lists of names with signs of known features
        column_ids, term_names, term_signs = self._get_collision_info()
        feature_names = {}
        for col_id, names, signs in zip(column_ids, term_names, term_signs):
            if always_positive:
                feature_names[col_id] = [{'name': name, 'sign': 1}
                                         for name in names]
            else:
                if not always_signed and _invert_signs(signs):
                    signs = -signs
                feature_names[col_id] = [{'name': name, 'sign': sign}
                                         for name, sign in zip(names, signs)]
        return FeatureNames(
            feature_names,
            n_features=self.n_features,
            unkn_template=self.unkn_template)

    def recalculate_attributes(self, force=False):
        """ Update all computed attributes. It is only needed if you need
        to access computed attributes after :meth:`partial_fit` was called.
        """
        if not self._attributes_dirty and not force:
            return
        # Terms sorted by frequency, most common first.
        terms = np.array([term for term, _ in self._term_counts.most_common()])
        if len(terms):
            indices, signs = _get_indices_and_signs(self.hasher, terms)
        else:
            indices, signs = np.array([]), np.array([])
        self.terms_: np.ndarray = terms
        self.term_columns_ = indices
        self.term_signs_ = signs
        self.collisions_ = _get_collisions(indices)
        self.column_signs_ = self._get_column_signs()
        self._attributes_dirty = False

    def _get_column_signs(self):
        # nan marks columns that are unknown or have mixed-sign terms.
        column_signs = np.ones(self.n_features) * np.nan
        for hash_id, term_ids in self.collisions_.items():
            term_signs = self.term_signs_[term_ids]
            if _invert_signs(term_signs):
                column_signs[hash_id] = -1
            elif (term_signs > 0).all():
                column_signs[hash_id] = 1
        return column_signs

    def _get_collision_info(self) -> tuple[list[int], list[np.ndarray], list[np.ndarray]]:
        # Expand the collisions_ mapping into parallel lists of column ids,
        # term name arrays and term sign arrays.
        column_ids, term_names, term_signs = [], [], []
        for column_id, _term_ids in self.collisions_.items():
            column_ids.append(column_id)
            term_names.append(self.terms_[_term_ids])
            term_signs.append(self.term_signs_[_term_ids])
        return column_ids, term_names, term_signs

def _get_collisions(indices) -> dict[int, list[int]]: """ Return a dict {column_id: [possible term ids]} with collision information. """ collisions: dict[int, list[int]] = defaultdict(list) for term_id, hash_id in enumerate(indices): collisions[hash_id].append(term_id) return dict(collisions)

def _get_indices_and_signs(hasher, terms):
    """ For each term from terms return its column index and sign,
    as assigned by FeatureHasher ``hasher``.
    """
    X = _transform_terms(hasher, terms)
    # One non-zero entry per row (one term per row): column of that entry.
    indices = X.nonzero()[1]
    # Row sums collapse the single non-zero entry to its signed value.
    signs = X.sum(axis=1).A.ravel()
    return indices, signs

def _transform_terms(hasher, terms): return hasher.transform(np.array(terms).reshape(-1, 1))

def _invert_signs(signs): """ Shall we invert signs? Invert if first (most probable) term is negative. """ return signs[0] < 0

def is_invhashing(vec):
    """ Return True if ``vec`` is an :class:`InvertableHashingVectorizer`. """
    return isinstance(vec, InvertableHashingVectorizer)

def handle_hashing_vec(vec, feature_names, coef_scale, with_coef_scale=True):
    """ Return ``feature_names`` and ``coef_scale`` (if ``with_coef_scale``
    is True), calling ``.get_feature_names_out`` for invhashing vectorizers.
    Explicitly passed feature_names / coef_scale are never overridden.
    """
    needs_coef_scale = with_coef_scale and coef_scale is None
    if is_invhashing(vec):
        if feature_names is None:
            feature_names = vec.get_feature_names_out(always_signed=False)
        if needs_coef_scale:
            coef_scale = vec.column_signs_
    elif (isinstance(vec, FeatureUnion)
          and any(is_invhashing(v) for _, v in vec.transformer_list)
          and (needs_coef_scale or feature_names is None)):
        _feature_names, _coef_scale = _invhashing_union_feature_names_scale(vec)
        if feature_names is None:
            feature_names = _feature_names
        if needs_coef_scale:
            coef_scale = _coef_scale
    return (feature_names, coef_scale) if with_coef_scale else feature_names

def _invhashing_union_feature_names_scale(
        vec_union: FeatureUnion) -> tuple[FeatureNames, np.ndarray]:
    """ Build combined feature names and a coef_scale array for a
    FeatureUnion that contains InvertableHashingVectorizer transformers.
    Feature names are prefixed with the transformer name
    (``<vec_name>__<feature>``), matching FeatureUnion's own convention.
    """
    feature_names_store: dict[int, Union[str, list]] = {}
    unkn_template = None
    shift = 0  # running column offset of the current transformer in the union
    coef_scale_values = []
    for vec_name, vec in vec_union.transformer_list:
        if isinstance(vec, InvertableHashingVectorizer):
            vec_feature_names = vec.get_feature_names_out(always_signed=False)
            unkn_template = vec_feature_names.unkn_template
            for idx, fs in vec_feature_names.feature_names.items():
                new_fs = []
                for f in fs:
                    new_f = dict(f)
                    new_f['name'] = '{}__{}'.format(vec_name, f['name'])
                    new_fs.append(new_f)
                feature_names_store[idx + shift] = new_fs
            coef_scale_values.append((shift, vec.column_signs_))
            shift += vec_feature_names.n_features
        else:
            vec_feature_names = vec.get_feature_names_out()
            feature_names_store.update(
                (shift + idx, '{}__{}'.format(vec_name, fname))
                for idx, fname in enumerate(vec_feature_names))
            shift += len(vec_feature_names)
    n_features = shift
    feature_names = FeatureNames(
        feature_names=feature_names_store,
        n_features=n_features,
        unkn_template=unkn_template)
    # Columns of non-invhashing transformers keep nan (no known sign).
    coef_scale = np.ones(n_features) * np.nan
    for idx, values in coef_scale_values:
        coef_scale[idx: idx + len(values)] = values
    return feature_names, coef_scale

def invert_hashing_and_fit(
        vec: Union[FeatureUnion, HashingVectorizer],
        docs,
) -> Union[FeatureUnion, InvertableHashingVectorizer]:
    """ Create an :class:`~.InvertableHashingVectorizer` from hashing
    vectorizer ``vec`` and fit it on ``docs``. If ``vec`` is a FeatureUnion,
    do it for all hashing vectorizers in the union.
    Return an :class:`~.InvertableHashingVectorizer`, or a FeatureUnion,
    or an unchanged vectorizer.
    """
    if isinstance(vec, HashingVectorizer):
        vec = InvertableHashingVectorizer(vec)
        vec.fit(docs)
    elif (isinstance(vec, FeatureUnion)
          and any(isinstance(v, HashingVectorizer)
                  for _, v in vec.transformer_list)):
        vec = _fit_invhashing_union(vec, docs)
    return vec

def _fit_invhashing_union(vec_union: FeatureUnion, docs) -> FeatureUnion:
    """ Fit InvertableHashingVectorizer on docs inside a FeatureUnion,
    preserving the union's weights and n_jobs. """
    return FeatureUnion(
        [(name, invert_hashing_and_fit(v, docs))
         for name, v in vec_union.transformer_list],
        transformer_weights=vec_union.transformer_weights,
        n_jobs=vec_union.n_jobs)