# Source code for parasolr.query.queryset

"""
Object-oriented approach to Solr searching and filtering modeled
on :class:`django.models.queryset.QuerySet`.  Supports iteration,
slicing, counting, and boolean check to see if a search has results.

Filter, search and sort methods return a new queryset, and can be
chained. For example::

    (
        SolrQuerySet(solrclient)
        .filter(item_type_s='person')
        .search(name='hem*')
        .order_by('sort_name')
    )


If you are working with Django you should use
:class:`parasolr.django.SolrQuerySet`,
which will automatically initialize a new :class:`parasolr.django.SolrClient`
if one is not passed in.
"""
from typing import Any, Dict, List, Optional

from parasolr.solr import SolrClient
from parasolr.solr.client import ParasolrDict, QueryResponse


[docs]class SolrQuerySet:
    """A Solr queryset object that allows for object oriented
    searching and filtering of Solr results. Allows search results
    to be paginated using slicing, count, and iteration.

    Query configuration is held in the attributes declared below. The
    chainable methods (filter, search, order_by, ...) leave the current
    queryset untouched and return a modified copy made by ``_clone``.
    """

    # most recent Solr response; set by get_response
    _result_cache = None
    # slice limits: ``start`` is sent as the Solr ``start`` offset and
    # ``stop`` is used to calculate the number of rows requested
    start = 0
    stop = None
    # NOTE: the list and dict values below are class-level objects;
    # _clone assigns fresh copies of each on the queryset it returns.
    # sort clauses, e.g. "name asc"
    sort_options = []
    # search queries, combined into the ``q`` parameter
    search_qs = []
    # filter queries, sent as ``fq``
    filter_qs = []
    # fields to return, sent as ``fl``
    field_list = []
    # fields to highlight, sent as ``hl.fl``
    highlight_fields = []
    # single field to group on, sent as ``group.field``
    group_field = None
    # fields to facet on, sent as ``facet.field``
    facet_field_list = []
    # fields to generate stats for, sent as ``stats.field``
    stats_field_list = []
    # fields to generate range facets for, sent as ``facet.range``
    range_facet_fields = []
    # additional options for faceting, stats, highlighting and grouping
    facet_opts = {}
    stats_opts = {}
    highlight_opts = {}
    group_opts = {}
    # arbitrary parameters added to the query options unchanged
    raw_params = {}

    #: by default, combine search queries with AND
    default_search_operator = "AND"

    #: any value constant
    ANY_VALUE = "[* TO *]"
    #: lookup separator
    LOOKUP_SEP = "__"

    def __init__(self, solr: SolrClient):
        # requires solr client so that this version can be django-agnostic
        self.solr = solr
        # convert search operator into form needed for combining queries
        self._search_op = " %s " % self.default_search_operator

[docs]    def get_response(self, **kwargs) -> List[dict]:
        """
        Query Solr and get the results for the current query and filter
        options. Populates result cache and returns the documents portion
        of the reponse.

        Returns:
            Solr response documents as a list of dictionaries.
        """
        # TODO: can we store the result cache and only retrieve
        # if query options have changed?
        # For now, always query.

        # if cached and no override query args are specified,
        # return existing cached result
        if self._result_cache and not kwargs:
            return self._result_cache

        query_opts = self.query_opts()
        query_opts.update(**kwargs)

        # NOTE: still need to work around Solr default of 10 rows
        # see https://github.com/Princeton-CDH/parasolr/issues/43

        # note that we're caching the result with override options here,
        # which may not always be the right thing to do ...
        self._result_cache = self.solr.query(**query_opts)

        # NOTE: django templates choke on AttrDict because it is
        # callable; using dictionary response instead

        return self._result_cache

[docs]    def get_results(self, **kwargs) -> List[dict]:
        """
        Query Solr and get the results for the current query and filter
        options. Populates result cache and returns the documents portion
        of the reponse.
        (Note that this method is not currently compatible with grouping.)

        Returns:
            Solr response documents as a list of dictionaries.
        """
        # get query response
        response = self.get_response(**kwargs)
        # if there is a query error, result will not be set
        if response:
            # NOTE: should probably handle result doc tranformation on grouped responses.
            # Intentionally applying to .docs instead of .items to trigger
            # an error if anyone attempts to use this on a grouped response
            return [self.get_result_document(doc) for doc in self._result_cache.docs]
        return []

[docs]    def get_result_document(self, doc):
        """Method to transform document results. Default behavior is to
        convert from attrdict to dict."""
        return doc.as_dict()

[docs]    def _set_highlighting_opts(self, query_opts: Dict) -> None:
        """Configure highlighting attributes on query_opts. Modifies
        dictionary directly."""
        if self.highlight_fields:
            query_opts.update({"hl": True, "hl.fl": ",".join(self.highlight_fields)})
            # highlighting options should be added as-is
            # (prefixes added in highlight methods)
            query_opts.update(self.highlight_opts)

[docs]    def _set_group_opts(self, query_opts: Dict) -> None:
        """Configure grouping atrtibutes on query_opts. Modifies dictionary
        directly."""
        if self.group_field:
            query_opts.update({"group": True, "group.field": self.group_field})
            # any other group options can be added as-is
            query_opts.update(self.group_opts)

[docs]    def _set_faceting_opts(self, query_opts: Dict) -> None:
        """Configure faceting attributes directly on query_opts. Modifies
        dictionary directly."""
        if self.facet_field_list or self.range_facet_fields or self.facet_opts:
            query_opts.update(
                {
                    "facet": True,
                    "facet.field": self.facet_field_list,
                    "facet.range": self.range_facet_fields,
                }
            )
            for key, val in self.facet_opts.items():
                # use key as is if it starts with "f."
                # (field-specific facet options); otherwise prepend "facet."
                query_opts[key if key.startswith("f.") else "facet.%s" % key] = val

[docs]    def _set_stats_opts(self, query_opts: Dict) -> None:
        """Configure stats attributes directly on query_opts. Modifies
        dictionary directly."""
        if self.stats_field_list:
            query_opts.update({"stats": True, "stats.field": self.stats_field_list})
            for key, val in self.stats_opts.items():
                # use key as if it starts with stats, otherwise prepend
                query_opts[key if key.startswith("stats") else "stats.%s" % key] = val

[docs]    def query_opts(self) -> Dict[str, str]:
        """Construct query options based on current queryset configuration.
        Includes filter queries, start and rows, sort, and search query.
        """
        query_opts = {
            "start": self.start,
            # filter query
            "fq": self.filter_qs,
            # field list
            "fl": ",".join(self.field_list),
            # main query; if no query is defined, find everything
            "q": self._search_op.join(self.search_qs) or "*:*",
            "sort": ",".join(self.sort_options),
        }

        # use stop if set to limit row numbers
        if self.stop:
            query_opts["rows"] = self.stop - self.start

        # highlighting
        self._set_highlighting_opts(query_opts)

        # grouping
        self._set_group_opts(query_opts)

        # faceting
        self._set_faceting_opts(query_opts)

        # stats
        self._set_stats_opts(query_opts)

        # include any raw query parameters
        query_opts.update(self.raw_params)

        # remove any empty string values
        query_opts = {k: v for k, v in query_opts.items() if v not in ["", []]}

        return query_opts

    def __len__(self) -> int:
        return self.count()

[docs]    def count(self) -> int:
        """Total number of results for the current query"""

        # if result cache is already populated, use it
        if self._result_cache:
            return self._result_cache.numFound

        # otherwise, query with current options but request zero rows
        # and do not populate the result cache
        query_opts = self.query_opts()
        # setting these by dictionary assignment, because conflicting
        # kwargs results in a Python exception
        query_opts["rows"] = 0
        query_opts["facet"] = False
        query_opts["hl"] = False
        result = self.solr.query(**query_opts)
        # if there is a query error, no result is returned
        if result:
            return result.numFound
        # error = no results found
        return 0

[docs]    def get_facets(self) -> Dict[str, Dict]:
        """Return a dictionary of facet information included in the
        Solr response. Includes facet fields, facet ranges, etc. Facet
        field results are returned as an ordered dict of value and count.
        """
        if self._result_cache:
            return self._result_cache.facet_counts

        # since we just want a dictionary of facet fields, don't populate
        # the result cache, no rows needed
        query_opts = self.query_opts()
        query_opts["rows"] = 0
        query_opts["hl"] = False
        # setting these by dictionary assignment, because conflicting
        # kwargs results in a Python exception
        result = self.solr.query(**query_opts)
        if result:
            return result.facet_counts
        return {}

[docs]    def get_stats(self) -> Optional[Dict[str, ParasolrDict]]:
        """Return a dictionary of stats information in Solr format or None
        on error."""
        if self._result_cache:
            return self._result_cache.stats

        # since we just want a dictionary of stats fields, don't populate
        # the result cache, no rows needed
        query_opts = self.query_opts()
        query_opts["rows"] = 0
        query_opts["hl"] = False
        # setting these by dictionary assignment, because conflicting
        # kwargs results in a Python exception
        result = self.solr.query(**query_opts)
        if result:
            return result.stats
        return {}

[docs]    def get_expanded(self) -> Dict[str, Dict]:
        """Return a dictionary of expanded records included in the
        Solr response.
        """
        if not self._result_cache:
            self.get_results()

        return self._result_cache.expanded

[docs]    @staticmethod
    def _lookup_to_filter(key: str, value: Any, tag: str = "") -> str:
        """Convert keyword/value argument, with optional lookups separated by
        ``__``, including: in and exists. Field names should *NOT* include
        double-underscores by convention. Accepts an optional tag argument
        to specify an exclude tag as needed.

            Returns: A propertly formatted Solr query string.
        """
        # check for a lookup separator and split
        lookup = None
        solr_query = ""

        # split once on lookup separator; assumes only one
        split_key = key.split(SolrQuerySet.LOOKUP_SEP, 1)
        if len(split_key) == 1:
            # simple lookup, return key,value pair
            solr_query = "%s:%s" % (key, value)

        else:
            key, lookup = split_key

            # list filter (field__in=[a, b, c])
            if lookup == "in":
                # value is a list, join with OR logic for all values in list,
                # treat '' or None values as flagging an exists query
                not_exists = "" in value or None in value
                value = list(filter(lambda x: x not in ["", None], value))

                # if we have a case where the list was just a single falsy value
                # treat as if __exists=False
                if not value:
                    solr_query = "-%s:%s" % (key, SolrQuerySet.ANY_VALUE)
                # otherwise, field lookup on any value by OR
                else:
                    # FIXME: do we need quotes around strings here?
                    solr_query = "%s:(%s)" % (key, " OR ".join(value))

                    if not_exists:
                        # To search for no value OR specified values,
                        # do a negative lookup that negates a positive lookup
                        # for any value and double-negates a lookup
                        # for the requested values
                        # The final output is something like:
                        # -(item_type_s:[* TO *] OR item_type_s:(book OR periodical))
                        solr_query = "-(%s:%s OR -%s)" % (
                            key,
                            SolrQuerySet.ANY_VALUE,
                            solr_query,
                        )

            # exists=True/False filter
            elif lookup == "exists":
                # query for any value if exists is true; otherwise no value
                solr_query = "%s%s:%s" % (
                    "" if value else "-",
                    key,
                    SolrQuerySet.ANY_VALUE,
                )

            elif lookup == "range":
                start, end = value
                solr_query = "%s:[%s TO %s]" % (key, start or "*", end or "*")

        # format tag for inclusion and add to query if set
        if tag:
            solr_query = "{!tag=%s}%s" % (tag, solr_query)

        return solr_query

[docs]    def filter(self, *args, tag: str = "", **kwargs) -> "SolrQuerySet":
        """
        Return a new SolrQuerySet with Solr filter queries added.
        Multiple filters can be combined either in a single
        method call, or they can be chained for the same effect.
        For example::

            queryset.filter(item_type_s='person').filter(birth_year=1900)
            queryset.filter(item_type_s='person', birth_year=1900)

        A tag may be specified for the filter to be used with facet.field
        exclusions::

            queryset.filter(item_type_s='person', tag='person')

        To provide a filter that should be used unmodified, provide
        the exact string of your filter query::

            queryset.filter('birth_year:[1800 TO *]')

        You can also search for pre-defined using lookups on a field,
        for example::

            queryset.filter(item_type_s__in=['person', 'book'])
            queryset.filter(item_type_s__exists=False)

        Currently supported field lookups:

            * **in** : takes a list of values; supports '' or None to match
              on field not set
            * **exists**: boolean filter to look for any value / no value
            * **range**: range query. Takes a list or tuple of two values
               for the start and end of the range. Either value can
               be unset for an open-ended range (e.g. `year__range=(1800, None)`)

        """
        qs_copy = self._clone()

        # any args are treated as filter queries without modification
        qs_copy.filter_qs.extend(args)
        for key, value in kwargs.items():
            qs_copy.filter_qs.append(self._lookup_to_filter(key, value, tag=tag))
        return qs_copy

[docs]    def facet(self, *args: str, **kwargs) -> "SolrQuerySet":
        """
        Request facets for specified fields. Returns a new SolrQuerySet
        with Solr faceting enabled and facet.field parameter set. Does not
        support ranged faceting.

        Subsequent calls will reset the facet.field to the last set of
        args in the chain.

        For example::

            qs = queryset.facet('person_type', 'age')
            qs = qs.facet('item_type_s')

        would result in ``item_type_s`` being the only facet field.
        """
        qs_copy = self._clone()

        # cast args tuple to list for consistency with other iterable fields
        qs_copy.facet_field_list = list(args)
        # add other kwargs to be prefixed in query_opts
        qs_copy.facet_opts.update(kwargs)

        return qs_copy

[docs]    def stats(self, *args: str, **kwargs) -> "SolrQuerySet":
        """
        Request stats for specified fields. Returns a new SolrQuerySet
        with Solr faceting enabled and stats.field parameter set.

        Subsequent calls will reset the stats.field to the last set of
        args in the chain.

        For example::

            qs = queryset.stats('person_type', 'age')
            qs = qs.stats('account_start_i')

        would result in ``account_start_i`` being the only facet field.

        Any kwargs will be prepended with ``stats.``. You may also pass local
        parameters along with field names, i.e. ``{!ex=filterA}account_start_i``.
        """

        qs_copy = self._clone()
        # cast args tuple to list for consistency with other iterable fields
        qs_copy.stats_field_list = list(args)
        # add other kwargs to be prefixed in query_opts
        qs_copy.stats_opts.update(kwargs)

        return qs_copy

[docs]    def facet_field(self, field: str, exclude: str = "", **kwargs) -> "SolrQuerySet":
        """
        Request faceting for a single field. Returns a new SolrQuerySet
        with Solr faceting enabled and the field added to
        the list of facet fields. Any keyword arguments will be set
        as field-specific facet configurations.

        ``ex`` will specify a related filter query tag to exclude when
        generating counts for the facet.

        """
        qs_copy = self._clone()
        # append exclude tag if specified
        qs_copy.facet_field_list.append(
            "{!ex=%s}%s" % (exclude, field) if exclude else field
        )
        # prefix any keyword args with the field name
        # (facet. prefix added in query_opts)

        qs_copy.facet_opts.update(
            {"f.%s.facet.%s" % (field, opt): value for opt, value in kwargs.items()}
        )

        return qs_copy

[docs]    def facet_range(self, field: str, **kwargs) -> "SolrQuerySet":
        """
        Request range faceting for a single field. Returns a new SolrQuerySet
        with Solr range faceting enabled and the field added to
        the list of facet fields. Keyword arguments such as start, end, and gap
        will be set as field-specific facet configurations.
        """
        # start, end, gap are required by Solr, but we don't actually
        # treat them any differently so it's easier to include as kwargs
        qs_copy = self._clone()
        # add field to list of range facet fields
        qs_copy.range_facet_fields.append(field)

        # configure facet options for this field (start, end, gap)
        qs_copy.facet_opts.update(
            {
                "f.%s.facet.range.%s" % (field, opt): value
                for opt, value in kwargs.items()
            }
        )
        return qs_copy

[docs]    def search(self, *args, **kwargs) -> "SolrQuerySet":
        """
        Return a new SolrQuerySet with search queries added. All
        queries will combined with the default search operator when
        constructing the `q` parameter sent to Solr..
        """
        qs_copy = self._clone()
        # any args are treated as search queries without modification
        qs_copy.search_qs.extend(args)

        for key, value in kwargs.items():
            qs_copy.search_qs.append(self._lookup_to_filter(key, value))

        return qs_copy

[docs]    def order_by(self, *args) -> "SolrQuerySet":
        """Apply sort options to the queryset by field name. If the field
        name starts with -, sort is descending; otherwise ascending."""
        qs_copy = self._clone()
        for sort_option in args:
            if sort_option.startswith("-"):
                sort_order = "desc"
                sort_option = sort_option.lstrip("-")
            else:
                sort_order = "asc"
            qs_copy.sort_options.append("%s %s" % (sort_option, sort_order))

        return qs_copy

[docs]    def query(self, **kwargs) -> "SolrQuerySet":
        """Return a new SolrQuerySet with the results populated from Solr.
        Any options passed in via keyword arguments take precedence
        over query options on the queryset.
        """
        qs_copy = self._clone()
        qs_copy.get_results(**kwargs)
        return qs_copy

[docs]    def only(self, *args, replace=True, **kwargs) -> "SolrQuerySet":
        """Use field limit option to return only the specified fields.
        Optionally provide aliases for them in the return. Subsequent
        calls will *replace* any previous field limits. Example::

            queryset.only('title', 'author', 'date')
            queryset.only('title:title_t', 'date:pubyear_i')

        """
        qs_copy = self._clone()
        # *replace* any existing field list with the current values
        if replace:
            qs_copy.field_list = list(args)
        # unless specified, in which case append
        else:
            qs_copy.field_list.extend(list(args))

        for key, value in kwargs.items():
            qs_copy.field_list.append("%s:%s" % (key, value))

        return qs_copy

[docs]    def also(self, *args, **kwargs) -> "SolrQuerySet":
        """Use field limit option to return the specified fields,
        optionally provide aliases for them in the return. Works
        exactly the same way as :meth:`only` except that it
        does not any previously specified field limits.
        """
        return self.only(*args, replace=False, **kwargs)

[docs]    def highlight(self, field: str, **kwargs) -> "SolrQuerySet":
        """ "Configure highlighting. Takes arbitrary Solr highlight
        parameters and adds the `hl.` prefix to them.  Example use::

            queryset.highlight('content', snippets=3, method='unified')
        """
        qs_copy = self._clone()
        qs_copy.highlight_fields.append(field)
        # make highlight options field-specific to allow for multiple
        qs_copy.highlight_opts.update(
            {"f.%s.hl.%s" % (field, opt): value for opt, value in kwargs.items()}
        )

        return qs_copy

[docs]    def group(self, field: str, **kwargs) -> "SolrQuerySet":
        """ "Configure grouping. Takes arbitrary Solr group
        parameters and adds the `group.` prefix to them.  Example use,
        grouping on a `group_id` field, limiting to three results per group,
        and sorting group members by an `order` field::

            queryset.group('group_id', limit=3, sort='order asc')
        """
        qs_copy = self._clone()
        # store group field and grouping options
        # for now, assuming single group field
        qs_copy.group_field = field
        qs_copy.group_opts.update(
            {"group.%s" % opt: value for opt, value in kwargs.items()}
        )

        return qs_copy

[docs]    def raw_query_parameters(self, **kwargs) -> "SolrQuerySet":
        """Add abritrary raw parameters to be included in the query
        request, e.g. for variables referenced in join or field queries.
        Analogous to the input of the same name in the Solr web interface."""
        qs_copy = self._clone()
        qs_copy.raw_params.update(kwargs)
        return qs_copy

[docs]    def get_highlighting(self) -> Dict[str, Dict[str, List]]:
        """Return the highlighting portion of the Solr response."""
        return self.get_response().highlighting

[docs]    def all(self) -> "SolrQuerySet":
        """Return a new queryset that is a copy of the current one."""
        return self._clone()

[docs]    def none(self) -> "SolrQuerySet":
        """Return an empty result list."""
        qs_copy = self._clone()
        # replace any search queries with this to find not anything
        qs_copy.search_qs = ["NOT *:*"]
        return qs_copy

[docs]    def _clone(self) -> "SolrQuerySet":
        """
        Return a copy of the current QuerySet for modification via
        filters.
        """
        # create a new instance with same solr and query opts
        # use current class to support extending
        qs_copy = self.__class__(solr=self.solr)
        # set attributes that can be copied directly
        qs_copy.start = self.start
        qs_copy.stop = self.stop
        qs_copy.highlight_fields = list(self.highlight_fields)
        qs_copy.group_field = self.group_field

        # set copies of list and dict attributes
        qs_copy.search_qs = list(self.search_qs)
        qs_copy.filter_qs = list(self.filter_qs)
        qs_copy.sort_options = list(self.sort_options)
        qs_copy.field_list = list(self.field_list)
        qs_copy.range_facet_fields = list(self.range_facet_fields)
        qs_copy.highlight_opts = dict(self.highlight_opts)
        qs_copy.group_opts = dict(self.group_opts)
        qs_copy.raw_params = dict(self.raw_params)
        qs_copy.facet_field_list = list(self.facet_field_list)
        qs_copy.facet_opts = dict(self.facet_opts)
        qs_copy.stats_field_list = list(self.stats_field_list)
        qs_copy.stats_opts = dict(self.stats_opts)

        return qs_copy

[docs]    def set_limits(self, start, stop):
        """Set limits to get a subsection of the results, to support slicing."""
        if start is None:
            start = 0
        self.start = start
        self.stop = stop

    # NOTE(review): not referenced anywhere else in this portion of the
    # file; presumably a batch size intended for chunked iteration —
    # confirm before relying on it
    iter_chunk_size = 1000

    def __iter__(self):
        """Iterate over result documents for this query."""
        return iter(self.get_results())

    def __bool__(self):
        """results are not empty"""
        return bool(self.get_results())

    def __getitem__(self, k):
        """Return a single result or a slice of results"""
        # based on django queryset logic

        if not isinstance(k, (int, slice)):
            raise TypeError
        assert (not isinstance(k, slice) and (k >= 0)) or (
            isinstance(k, slice)
            and (k.start is None or k.start >= 0)
            and (k.stop is None or k.stop >= 0)
        ), "Negative indexing is not supported."

        # if the result cache is already populated,
        # return the requested index or slice
        if self._result_cache:
            return self._result_cache.items[k]

        qs_copy = self._clone()

        if isinstance(k, slice):
            if k.start is not None:
                start = int(k.start)
            else:
                start = None
            if k.stop is not None:
                stop = int(k.stop)
            else:
                stop = None

            qs_copy.set_limits(start, stop)
            return list(qs_copy)[:: k.step] if k.step else qs_copy

        # single item
        qs_copy.set_limits(k, k + 1)
        return qs_copy.get_results()[0]


# EmptySolrQuerySet instance checking is adapted from Django's solution:
# https://github.com/django/django/blob/master/django/db/models/query.py#L1313-L1325
# see also:
# https://docs.djangoproject.com/en/2.2/ref/models/querysets/#none


class InstanceCheckMeta(type):
    """Metaclass that customizes ``isinstance`` checks so that an empty
    :class:`SolrQuerySet` is reported as an instance of the class using
    this metaclass."""

    def __instancecheck__(self, instance):
        """True when ``instance`` is a :class:`SolrQuerySet` that
        evaluates as false, i.e. has no results."""
        if not isinstance(instance, SolrQuerySet):
            return False
        # emptiness is determined by the queryset's __bool__
        return not instance


class EmptySolrQuerySet(metaclass=InstanceCheckMeta):
    """
    Marker class for checking whether a queryset is empty, by way of
    :func:`isinstance`::

        isinstance(queryset, EmptySolrQuerySet)  # True if empty

    The check is carried out by the metaclass. This class is never
    instantiated; attempting to do so raises :class:`TypeError`.
    """

    def __init__(self, *args, **kwargs):
        message = "EmptySolrQuerySet can't be instantiated"
        raise TypeError(message)