textblob.formats — TextBlob 0.19.0 documentation (original) (raw)

"""File formats for training and testing data.

Includes a registry of valid file formats. New file formats can be added to the registry like so: ::

    from textblob import formats


    class PipeDelimitedFormat(formats.DelimitedFormat):
        delimiter = "|"


    formats.register("psv", PipeDelimitedFormat)

Once a format has been registered, classifiers will be able to read data files with that format. ::

    from textblob.classifiers import NaiveBayesClassifier

    with open("training_data.psv", "r") as fp:
        cl = NaiveBayesClassifier(fp, format="psv")

"""

from future import annotations

import csv import json from collections import OrderedDict

from textblob.utils import is_filelike

DEFAULT_ENCODING = "utf-8"

class BaseFormat:
    """Interface for format classes.

    Individual formats can decide on the composition and meaning of
    ``**kwargs``.

    :param File fp: A file-like object.

    .. versionchanged:: 0.9.0
        Constructor receives a file pointer rather than a file path.
    """

    def __init__(self, fp, **kwargs):
        # The base class stores nothing; subclasses consume ``fp`` themselves.
        pass

    def to_iterable(self):
        """Return an iterable object from the data.

        :raises NotImplementedError: Always; subclasses must override this.
        """
        raise NotImplementedError('Must implement a "to_iterable" method.')

    @classmethod
    def detect(cls, stream: str):
        """Return True if ``stream`` is in this file format.

        :param str stream: Contents to inspect (data, not a filename).
        :raises NotImplementedError: Always; subclasses must override this.

        .. versionchanged:: 0.9.0
            Changed from a static method to a class method.
        """
        raise NotImplementedError('Must implement a "detect" class method.')

class DelimitedFormat(BaseFormat):
    """A general character-delimited format.

    Subclasses select the separator by overriding ``delimiter``.

    :param File fp: A file-like object, parsed with :func:`csv.reader`.
    """

    # Rows produced by ``csv.reader``; filled in by ``__init__``.
    data: list[list[str]]
    # Field separator handed to both ``csv.reader`` and ``csv.Sniffer``.
    delimiter = ","

    def __init__(self, fp, **kwargs):
        super().__init__(fp, **kwargs)
        # Read every row up front; ``to_iterable`` simply returns this list.
        self.data = list(csv.reader(fp, delimiter=self.delimiter))

    def to_iterable(self):
        """Return an iterable object from the data."""
        return self.data

    @classmethod
    def detect(cls, stream):
        """Return True if stream is valid.

        "Valid" means ``csv.Sniffer`` can deduce a dialect from ``stream``
        when restricted to this class's ``delimiter``.

        :param stream: Sample text to sniff.
        """
        try:
            csv.Sniffer().sniff(stream, delimiters=cls.delimiter)
        except (csv.Error, TypeError):
            # csv.Error: no dialect could be determined; TypeError: the
            # sample is not text the sniffer can process.
            return False
        return True

class CSV(DelimitedFormat):
    """CSV format. Assumes each row is of the form ``text,label``.
    ::

        Today is a good day,pos
        I hate this car.,neg
    """

    delimiter = ","

class TSV(DelimitedFormat):
    # Raw docstring so ``\t`` is shown literally instead of becoming a tab.
    r"""TSV format. Assumes each row is of the form ``text\tlabel``."""

    delimiter = "\t"

class JSON(BaseFormat):
    """JSON format.

    Assumes that JSON is formatted as an array of objects with ``text`` and
    ``label`` properties.
    ::

        [
            {"text": "Today is a good day.", "label": "pos"},
            {"text": "I hate this car.", "label": "neg"}
        ]

    :param File fp: A file-like object containing the JSON document.
    """

    def __init__(self, fp, **kwargs):
        BaseFormat.__init__(self, fp, **kwargs)
        # Parsed document; ``to_iterable`` expects a sequence of objects that
        # each carry "text" and "label" keys.
        self.dict = json.load(fp)

    def to_iterable(self):
        """Return an iterable object from the JSON data.

        :returns: A list of ``(text, label)`` tuples, one per parsed object.
        :raises KeyError: If an object lacks a ``text`` or ``label`` key.
        """
        return [(entry["text"], entry["label"]) for entry in self.dict]

    @classmethod
    def detect(cls, stream: str | bytes | bytearray):
        """Return True if stream is valid JSON.

        :param stream: Sample to parse with :func:`json.loads`.
        """
        try:
            json.loads(stream)
        except (ValueError, TypeError):
            # ValueError covers malformed JSON (JSONDecodeError subclasses it);
            # TypeError covers samples that are not str/bytes/bytearray, so a
            # bad sample means "not this format" rather than a crash, matching
            # DelimitedFormat.detect.
            return False
        return True

# Built-in formats keyed by their short name. Entries are inserted in the
# order csv, json, tsv, which is also the order ``_registry.values()`` yields.
_registry = OrderedDict()
_registry["csv"] = CSV
_registry["json"] = JSON
_registry["tsv"] = TSV

def detect(fp, max_read=1024):
    """Attempt to detect a file's format, trying each of the supported
    formats. Return the format class that was detected. If no format is
    detected, return ``None``.

    :param fp: A file-like object. Anything else yields ``None``.
    :param int max_read: Maximum amount of data to read as the sample.
    :returns: The first registered format class whose ``detect`` accepts the
        sample, or ``None``. ``fp`` is rewound to offset 0 afterwards.
    """
    if not is_filelike(fp):
        return None
    # Every candidate inspects the same sample, so read it once and rewind
    # once instead of re-reading the file for each registered format.
    sample = fp.read(max_read)
    fp.seek(0)
    for format_class in _registry.values():
        if format_class.detect(sample):
            return format_class
    return None

def get_registry():
    """Return a dictionary of registered formats.

    The module-level registry object itself is returned (not a copy), so
    changes made to the result are visible to later lookups.
    """
    return _registry

def register(name, format_class):
    """Register a new format.

    An existing entry with the same name is replaced.

    :param str name: The name that will be used to refer to the format, e.g. 'csv'
    :param type format_class: The format class to register.
    """
    get_registry()[name] = format_class