Source code for parasolr.indexing
"""
Model-based indexing with Solr.
Items to be indexed in Solr should extend :class:`Indexable`. The
default implementation should work for most Django models; at a minimum
you should extend :meth:`Indexable.index_data` to include the information
to be indexed in Solr. You may also customize :meth:`Indexable.index_item_type`
and :meth:`Indexable.index_item_id`.
To manually index content in Solr, see
:mod:`~parasolr.management.commands.index` manage command documentation.
-------------------------
"""
import itertools
import logging
try:
from django.db.models.query import QuerySet
from parasolr.django import SolrClient
except ImportError:
QuerySet = SolrClient = None
logger = logging.getLogger(__name__)
# recursive subclasses
# via https://stackoverflow.com/questions/3862310/how-to-find-all-the-subclasses-of-a-class-given-its-name
def all_subclasses(cls):
    """Recursively find all subclasses of a class.

    Args:
        cls: the class whose subclass tree should be walked.

    Returns:
        A set containing every direct and indirect subclass of ``cls``;
        ``cls`` itself is not included.
    """
    found = set()
    for subclass in cls.__subclasses__():
        # a class can be reachable through more than one parent (multiple
        # inheritance); only descend into it the first time it is seen
        if subclass not in found:
            found.add(subclass)
            found.update(all_subclasses(subclass))
    return found
class Indexable:
    """Mixin for objects that are indexed in Solr.

    Default implementations are provided for :meth:`index_id`,
    :meth:`index_data`, and :meth:`index`; subclasses will typically
    extend :meth:`index_data` to add the fields that should be indexed.

    When implementing an Indexable subclass where :meth:`items_to_index`
    returns something like a generator, which neither exposes a
    `count` method nor can be counted with `len`, you should implement
    :meth:`total_to_index` and return the number of items to be indexed
    for use with the Django index manage command.
    """

    # NOTE: current implementation is Django-specific, intended for
    # use with django models. Should be possible to generalize once
    # we have other use cases.

    #: number of items to index at once when indexing a large number of items
    index_chunk_size = 150
    #: solr connection; shared by all Indexable classes and created lazily
    solr = None
    #: id separator for auto-generated index ids
    ID_SEPARATOR = "."

    def __init__(self):
        # initialize connection to solr on first instance initialization
        Indexable._init_solr()

    @classmethod
    def _init_solr(cls):
        """Create the Solr client if it has not been created yet."""
        # store on the class to take advantage of sessions
        if cls.solr is None:
            cls.solr = SolrClient()

    @classmethod
    def all_indexables(cls):
        """Find all :class:`Indexable` subclasses for indexing. Ignore
        abstract and proxy :class:`Indexable` subclasses such as
        :class:`~parasolr.django.indexing.ModelIndexable`.

        Returns:
            List of subclasses; a subclass without a ``_meta`` attribute
            is always included, otherwise it is included only when
            ``_meta`` marks it as neither abstract nor proxy.
        """
        return [
            subclass
            for subclass in all_subclasses(cls)
            if not hasattr(subclass, "_meta")
            or (
                not getattr(subclass._meta, "abstract", False)
                and not getattr(subclass._meta, "proxy", False)
            )
        ]

    @classmethod
    def index_item_type(cls):
        """Label for this kind of indexable item. Must be unique
        across all Indexable items in an application. By default, uses
        Django model verbose name. Used in default index id and
        in index manage command."""
        # TODO: move this implementation into django subclass?
        # default could just return an attribute on the class
        return cls._meta.verbose_name

    @classmethod
    def items_to_index(cls):
        """Get all items to be indexed for a single class of Indexable
        content. Subclasses can override this method to return a custom
        iterable, e.g. a Django `QuerySet` that takes advantage of
        prefetching. By default, returns all Django objects for a model.

        Raises:
            NotImplementedError: if the default lookup fails with an
                :class:`AttributeError` (e.g. the class has no
                ``objects`` manager).
        """
        try:
            return cls.objects.all()
        except AttributeError:
            raise NotImplementedError

    @classmethod
    def total_to_index(cls):
        """Get the total number of items to be indexed for a single class of
        Indexable content. Subclasses should override this method
        if necessary. By default, returns a Django queryset count for a model.

        Raises:
            NotImplementedError: if the default count fails with an
                :class:`AttributeError` (e.g. the class has no
                ``objects`` manager).
        """
        try:
            return cls.objects.count()
        except AttributeError:
            raise NotImplementedError

    @classmethod
    def prep_index_chunk(cls, chunk):
        """Optional method for any additional processing on chunks
        of items being indexed. Intended to allow adding prefetching on
        a chunk when iterating on Django QuerySets; since indexing uses
        an iterator, prefetching configured in :meth:`items_to_index`
        is ignored.

        Args:
            chunk: list of items about to be indexed.

        Returns:
            The chunk to index; unchanged by default.
        """
        # default behavior is to do nothing; return chunk unchanged
        return chunk

    def index_id(self):
        """Solr identifier. By default, combines :meth:`index_item_type`
        and :attr:`id` with :attr:`ID_SEPARATOR`."""
        return "%s%s%s" % (self.index_item_type(), self.ID_SEPARATOR, self.id)

    def index_data(self):
        """Dictionary of data to index in Solr for this item.
        Default implementation adds :meth:`index_id` and
        :meth:`index_item_type`."""
        return {"id": self.index_id(), "item_type_s": self.index_item_type()}

    def index(self):
        """Index the current object in Solr."""
        self.solr.update.index([self.index_data()])

    @classmethod
    def index_items(cls, items, progbar=None):
        """Indexable class method to index multiple items at once. Takes
        any iterable (list, tuple, queryset, generator, ...) of Indexable
        items or dictionaries. Items are indexed in chunks, based on
        :attr:`Indexable.index_chunk_size`.

        Args:
            items: iterable of indexable objects or dictionaries
            progbar: optional :class:`progressbar.Progressbar` object to
                update when indexing items in chunks.

        Returns:
            Total number of items indexed
        """
        # make sure solr client is initialized
        Indexable._init_solr()

        if QuerySet and isinstance(items, QuerySet):
            # if this is a queryset, use iterator to get it in chunks
            items = items.iterator()
        else:
            # islice on a re-iterable container (list, tuple, set, ...)
            # restarts from the beginning on every call, which would index
            # the same first chunk forever; a single shared iterator
            # advances instead. iter() on an iterator/generator returns
            # the same object, so those inputs are unaffected.
            items = iter(items)

        # index in chunks to support efficiently indexing large numbers
        # of items (adapted from index script)
        count = 0
        chunk = list(itertools.islice(items, cls.index_chunk_size))
        while chunk:
            chunk = cls.prep_index_chunk(chunk)
            # call index data method if present; otherwise assume item is dict
            cls.solr.update.index(
                [
                    item.index_data() if hasattr(item, "index_data") else item
                    for item in chunk
                ]
            )
            count += len(chunk)
            # update progress bar if one was passed in
            if progbar:
                progbar.update(count)
            # get the next chunk
            chunk = list(itertools.islice(items, cls.index_chunk_size))
        return count

    def remove_from_index(self):
        """Remove the current object from Solr by identifier using
        :meth:`index_id`."""
        # compute the id once so the logged id is the one that is deleted
        doc_id = self.index_id()
        # NOTE(review): ids may include colons or other characters that
        # have meaning in Solr/lucene queries; any quoting is presumably
        # handled inside delete_by_id -- confirm against the client.
        logger.debug("Deleting document from index with id %s", doc_id)
        self.solr.update.delete_by_id([doc_id])