# Source code for parasolr.indexing

"""
Model-based indexing with Solr.

Items to be indexed in Solr should extend :class:`Indexable`. The
default implementation should work for most Django models; at a minimum
you should extend :meth:`Indexable.index_data` to include the information
to be indexed in Solr. You may also customize :meth:`Indexable.index_item_type`
and :meth:`Indexable.index_item_id`.

To manually index content in Solr, see
:mod:`~parasolr.management.commands.index` manage command documentation.

-------------------------

"""

import itertools
import logging

try:
    from django.db.models.query import QuerySet

    from parasolr.django import SolrClient
except ImportError:
    QuerySet = SolrClient = None


logger = logging.getLogger(__name__)


# recursive subclasses
# via https://stackoverflow.com/questions/3862310/how-to-find-all-the-subclasses-of-a-class-given-its-name
def all_subclasses(cls):
    """Return the set of all direct and indirect subclasses of ``cls``.

    Walks ``cls.__subclasses__()`` recursively. ``cls`` itself is not
    included in the result, and a class reachable through more than one
    inheritance path appears only once because the result is a set.

    Args:
        cls: the class whose subclass tree should be collected

    Returns:
        set of subclasses; empty if ``cls`` has no subclasses
    """
    found = set()
    for subclass in cls.__subclasses__():
        found.add(subclass)
        # descend so grandchildren (and deeper) are included as well
        found.update(all_subclasses(subclass))
    return found


class Indexable:
    """Mixin for objects that are indexed in Solr.

    The default implementations assume a Django model: they read
    ``cls._meta.verbose_name``, ``cls.objects`` and ``self.id``. At a
    minimum, subclasses should extend :meth:`index_data` to include the
    information to be indexed; :meth:`index_item_type` and
    :meth:`index_id` may also be customized.

    When implementing an Indexable subclass where items_to_index
    returns something like a generator, which does not expose either a
    `count` method or can be counted with `len`, for use with
    the Django index manage command you should
    implement `total_to_index` and return the number of items
    to be indexed.
    """

    # NOTE: current implementation is Django-specific, intended for
    # use with django models. Should be possible to generalize once
    # we have other use cases.

    #: number of items to index at once when indexing a large number of items
    index_chunk_size = 150

    #: solr connection
    solr = None

    #: id separator for auto-generated index ids
    ID_SEPARATOR = "."

    def __init__(self):
        # initialize connection to solr on first instance initialization
        Indexable._init_solr()

    @classmethod
    def _init_solr(cls):
        """Create the shared Solr client if it has not been created yet."""
        # store on the class to take advantage of sessions
        # NOTE: SolrClient is None when the Django imports at the top of
        # this module failed, in which case this call raises TypeError.
        if cls.solr is None:
            cls.solr = SolrClient()

    @classmethod
    def all_indexables(cls):
        """Find all :class:`Indexable` subclasses for indexing. Ignore abstract and
        proxy :class:`Indexable` subclasses such as
        :class:`~parasolr.django.indexing.ModelIndexable`."""
        return [
            subclass
            for subclass in all_subclasses(cls)
            # subclasses without ``_meta`` are always included; those with
            # ``_meta`` are included only if neither abstract nor proxy
            if not hasattr(subclass, "_meta")
            or (
                not getattr(subclass._meta, "abstract", False)
                and not getattr(subclass._meta, "proxy", False)
            )
        ]

    @classmethod
    def index_item_type(cls):
        """Label for this kind of indexable item. Must be unique
        across all Indexable items in an application. By default, uses
        Django model verbose name. Used in default index id and
        in index manage command."""
        # TODO: move this implementation into django subclass?
        # default could just return an attribute on the class
        return cls._meta.verbose_name

    @classmethod
    def items_to_index(cls):
        """Get all items to be indexed for a single class of Indexable
        content. Subclasses can override this method to return a custom
        iterable, e.g. a Django `QuerySet` that takes advantage of
        prefetching. By default, returns all Django objects for a model.

        Raises:
            NotImplementedError: if the default lookup via ``cls.objects``
                fails with an AttributeError
        """
        try:
            return cls.objects.all()
        except AttributeError as err:
            raise NotImplementedError(
                "%s must implement items_to_index" % cls.__name__
            ) from err

    @classmethod
    def total_to_index(cls):
        """Get the total number of items to be indexed for a single class of
        Indexable content. Subclasses should override this method
        if necessary. By default, returns a Django queryset count for a model.

        Raises:
            NotImplementedError: if the default lookup via ``cls.objects``
                fails with an AttributeError
        """
        try:
            return cls.objects.count()
        except AttributeError as err:
            raise NotImplementedError(
                "%s must implement total_to_index" % cls.__name__
            ) from err

    @classmethod
    def prep_index_chunk(cls, chunk):
        """Optional method for any additional processing on chunks
        of items being indexed. Intended to allow adding prefetching on
        a chunk when iterating on Django QuerySets; since indexing uses Iterator,
        prefetching configured in `items_to_index` is ignored."""
        # default behavior is to do nothing; return chunk unchanged
        return chunk

    def index_id(self):
        """Solr identifier. By default, combines :meth:`index_item_type`
        and :attr:`id` with :attr:`ID_SEPARATOR`."""
        return "%s%s%s" % (self.index_item_type(), self.ID_SEPARATOR, self.id)

    def index_data(self):
        """Dictionary of data to index in Solr for this item.
        Default implementation adds :meth:`index_id` and
        :meth:`index_item_type`."""
        return {"id": self.index_id(), "item_type_s": self.index_item_type()}

    def index(self):
        """Index the current object in Solr."""
        # make sure a solr client is available even if ``__init__`` was
        # bypassed; leave an already-configured client untouched
        if self.solr is None:
            Indexable._init_solr()
        self.solr.update.index([self.index_data()])

    @classmethod
    def index_items(cls, items, progbar=None):
        """Indexable class method to index multiple items at once.  Takes
        any iterable (list, tuple, queryset, generator, ...) of Indexable
        items or dictionaries.
        Items are indexed in chunks, based on :attr:`Indexable.index_chunk_size`.

        Args:
            items: iterable of indexable objects or dictionaries
            progbar: optional :class:`progressbar.Progressbar` object to
                update when indexing items in chunks.

        Returns:
            Total number of items indexed
        """

        # make sure solr client is initialized
        Indexable._init_solr()

        # if this is a queryset, use iterator to get it in chunks
        if QuerySet and isinstance(items, QuerySet):
            items = items.iterator()
        else:
            # Convert every other iterable to a one-shot iterator so each
            # islice call below resumes where the previous one stopped.
            # Without this, re-iterable containers (list, tuple, set, ...)
            # would yield the same first chunk forever. Calling iter() on
            # an existing iterator or generator returns it unchanged.
            items = iter(items)

        # index in chunks to support efficiently indexing large numbers
        # of items (adapted from index script)
        chunk = list(itertools.islice(items, cls.index_chunk_size))
        count = 0
        while chunk:
            chunk = cls.prep_index_chunk(chunk)
            # call index data method if present; otherwise assume item is dict
            cls.solr.update.index(
                [i.index_data() if hasattr(i, "index_data") else i for i in chunk]
            )
            count += len(chunk)
            # update progress bar if one was passed in
            if progbar:
                progbar.update(count)

            # get the next chunk
            chunk = list(itertools.islice(items, cls.index_chunk_size))

        return count

    def remove_from_index(self):
        """Remove the current object from Solr by identifier using
        :meth:`index_id`"""
        # make sure a solr client is available even if ``__init__`` was
        # bypassed; leave an already-configured client untouched
        if self.solr is None:
            Indexable._init_solr()
        # compute the id once so the logged id is the one that is deleted
        solr_id = self.index_id()
        # NOTE: deletes by id (not by query); the id is passed through
        # as-is, with no quoting or escaping applied here
        logger.debug("Deleting document from index with id %s", solr_id)
        self.solr.update.delete_by_id([solr_id])