453 lines
16 KiB
Python
453 lines
16 KiB
Python
|
# Copyright (c) 2014 Amazon.com, Inc. or its affiliates.
|
||
|
# All Rights Reserved
|
||
|
#
|
||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||
|
# copy of this software and associated documentation files (the
|
||
|
# "Software"), to deal in the Software without restriction, including
|
||
|
# without limitation the rights to use, copy, modify, merge, publish, dis-
|
||
|
# tribute, sublicense, and/or sell copies of the Software, and to permit
|
||
|
# persons to whom the Software is furnished to do so, subject to the fol-
|
||
|
# lowing conditions:
|
||
|
#
|
||
|
# The above copyright notice and this permission notice shall be included
|
||
|
# in all copies or substantial portions of the Software.
|
||
|
#
|
||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||
|
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
|
||
|
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||
|
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||
|
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||
|
# IN THE SOFTWARE.
|
||
|
#
|
||
|
from math import ceil
|
||
|
from boto.compat import json, map, six
|
||
|
import requests
|
||
|
from boto.cloudsearchdomain.layer1 import CloudSearchDomainConnection
|
||
|
|
||
|
SIMPLE = 'simple'
|
||
|
STRUCTURED = 'structured'
|
||
|
LUCENE = 'lucene'
|
||
|
DISMAX = 'dismax'
|
||
|
|
||
|
|
||
|
class SearchServiceException(Exception):
|
||
|
pass
|
||
|
|
||
|
|
||
|
class SearchResults(object):
|
||
|
def __init__(self, **attrs):
|
||
|
self.rid = attrs['status']['rid']
|
||
|
self.time_ms = attrs['status']['time-ms']
|
||
|
self.hits = attrs['hits']['found']
|
||
|
self.docs = attrs['hits']['hit']
|
||
|
self.start = attrs['hits']['start']
|
||
|
self.query = attrs['query']
|
||
|
self.search_service = attrs['search_service']
|
||
|
|
||
|
self.facets = {}
|
||
|
if 'facets' in attrs:
|
||
|
for (facet, values) in attrs['facets'].items():
|
||
|
if 'buckets' in values:
|
||
|
self.facets[facet] = dict((k, v) for (k, v) in map(lambda x: (x['value'], x['count']), values.get('buckets', [])))
|
||
|
|
||
|
self.num_pages_needed = ceil(self.hits / self.query.real_size)
|
||
|
|
||
|
def __len__(self):
|
||
|
return len(self.docs)
|
||
|
|
||
|
def __iter__(self):
|
||
|
return iter(self.docs)
|
||
|
|
||
|
def next_page(self):
|
||
|
"""Call Cloudsearch to get the next page of search results
|
||
|
|
||
|
:rtype: :class:`boto.cloudsearch2.search.SearchResults`
|
||
|
:return: the following page of search results
|
||
|
"""
|
||
|
if self.query.page <= self.num_pages_needed:
|
||
|
self.query.start += self.query.real_size
|
||
|
self.query.page += 1
|
||
|
return self.search_service(self.query)
|
||
|
else:
|
||
|
raise StopIteration
|
||
|
|
||
|
|
||
|
class Query(object):
|
||
|
|
||
|
RESULTS_PER_PAGE = 500
|
||
|
|
||
|
def __init__(self, q=None, parser=None, fq=None, expr=None,
|
||
|
return_fields=None, size=10, start=0, sort=None,
|
||
|
facet=None, highlight=None, partial=None, options=None):
|
||
|
|
||
|
self.q = q
|
||
|
self.parser = parser
|
||
|
self.fq = fq
|
||
|
self.expr = expr or {}
|
||
|
self.sort = sort or []
|
||
|
self.return_fields = return_fields or []
|
||
|
self.start = start
|
||
|
self.facet = facet or {}
|
||
|
self.highlight = highlight or {}
|
||
|
self.partial = partial
|
||
|
self.options = options
|
||
|
self.page = 0
|
||
|
self.update_size(size)
|
||
|
|
||
|
def update_size(self, new_size):
|
||
|
self.size = new_size
|
||
|
self.real_size = Query.RESULTS_PER_PAGE if (self.size >
|
||
|
Query.RESULTS_PER_PAGE or self.size == 0) else self.size
|
||
|
|
||
|
def to_params(self):
|
||
|
"""Transform search parameters from instance properties to a dictionary
|
||
|
|
||
|
:rtype: dict
|
||
|
:return: search parameters
|
||
|
"""
|
||
|
params = {'start': self.start, 'size': self.real_size}
|
||
|
|
||
|
if self.q:
|
||
|
params['q'] = self.q
|
||
|
|
||
|
if self.parser:
|
||
|
params['q.parser'] = self.parser
|
||
|
|
||
|
if self.fq:
|
||
|
params['fq'] = self.fq
|
||
|
|
||
|
if self.expr:
|
||
|
for k, v in six.iteritems(self.expr):
|
||
|
params['expr.%s' % k] = v
|
||
|
|
||
|
if self.facet:
|
||
|
for k, v in six.iteritems(self.facet):
|
||
|
if not isinstance(v, six.string_types):
|
||
|
v = json.dumps(v)
|
||
|
params['facet.%s' % k] = v
|
||
|
|
||
|
if self.highlight:
|
||
|
for k, v in six.iteritems(self.highlight):
|
||
|
params['highlight.%s' % k] = v
|
||
|
|
||
|
if self.options:
|
||
|
params['q.options'] = self.options
|
||
|
|
||
|
if self.return_fields:
|
||
|
params['return'] = ','.join(self.return_fields)
|
||
|
|
||
|
if self.partial is not None:
|
||
|
params['partial'] = self.partial
|
||
|
|
||
|
if self.sort:
|
||
|
params['sort'] = ','.join(self.sort)
|
||
|
|
||
|
return params
|
||
|
|
||
|
def to_domain_connection_params(self):
|
||
|
"""
|
||
|
Transform search parameters from instance properties to a dictionary
|
||
|
that CloudSearchDomainConnection can accept
|
||
|
|
||
|
:rtype: dict
|
||
|
:return: search parameters
|
||
|
"""
|
||
|
params = {'start': self.start, 'size': self.real_size}
|
||
|
|
||
|
if self.q:
|
||
|
params['q'] = self.q
|
||
|
|
||
|
if self.parser:
|
||
|
params['query_parser'] = self.parser
|
||
|
|
||
|
if self.fq:
|
||
|
params['filter_query'] = self.fq
|
||
|
|
||
|
if self.expr:
|
||
|
expr = {}
|
||
|
for k, v in six.iteritems(self.expr):
|
||
|
expr['expr.%s' % k] = v
|
||
|
|
||
|
params['expr'] = expr
|
||
|
|
||
|
if self.facet:
|
||
|
facet = {}
|
||
|
for k, v in six.iteritems(self.facet):
|
||
|
if not isinstance(v, six.string_types):
|
||
|
v = json.dumps(v)
|
||
|
facet['facet.%s' % k] = v
|
||
|
|
||
|
params['facet'] = facet
|
||
|
|
||
|
if self.highlight:
|
||
|
highlight = {}
|
||
|
for k, v in six.iteritems(self.highlight):
|
||
|
highlight['highlight.%s' % k] = v
|
||
|
|
||
|
params['highlight'] = highlight
|
||
|
|
||
|
if self.options:
|
||
|
params['query_options'] = self.options
|
||
|
|
||
|
if self.return_fields:
|
||
|
params['ret'] = ','.join(self.return_fields)
|
||
|
|
||
|
if self.partial is not None:
|
||
|
params['partial'] = self.partial
|
||
|
|
||
|
if self.sort:
|
||
|
params['sort'] = ','.join(self.sort)
|
||
|
|
||
|
return params
|
||
|
|
||
|
|
||
|
class SearchConnection(object):
|
||
|
|
||
|
def __init__(self, domain=None, endpoint=None):
|
||
|
self.domain = domain
|
||
|
self.endpoint = endpoint
|
||
|
self.session = requests.Session()
|
||
|
|
||
|
# Endpoint needs to be set before initializing CloudSearchDomainConnection
|
||
|
if not endpoint:
|
||
|
self.endpoint = domain.search_service_endpoint
|
||
|
|
||
|
# Copy proxy settings from connection and check if request should be signed
|
||
|
self.sign_request = False
|
||
|
if self.domain and self.domain.layer1:
|
||
|
if self.domain.layer1.use_proxy:
|
||
|
self.session.proxies['http'] = self.domain.layer1.get_proxy_url_with_auth()
|
||
|
|
||
|
self.sign_request = getattr(self.domain.layer1, 'sign_request', False)
|
||
|
|
||
|
if self.sign_request:
|
||
|
layer1 = self.domain.layer1
|
||
|
self.domain_connection = CloudSearchDomainConnection(
|
||
|
host=self.endpoint,
|
||
|
aws_access_key_id=layer1.aws_access_key_id,
|
||
|
aws_secret_access_key=layer1.aws_secret_access_key,
|
||
|
region=layer1.region,
|
||
|
provider=layer1.provider
|
||
|
)
|
||
|
|
||
|
def build_query(self, q=None, parser=None, fq=None, rank=None, return_fields=None,
|
||
|
size=10, start=0, facet=None, highlight=None, sort=None,
|
||
|
partial=None, options=None):
|
||
|
return Query(q=q, parser=parser, fq=fq, expr=rank, return_fields=return_fields,
|
||
|
size=size, start=start, facet=facet, highlight=highlight,
|
||
|
sort=sort, partial=partial, options=options)
|
||
|
|
||
|
def search(self, q=None, parser=None, fq=None, rank=None, return_fields=None,
|
||
|
size=10, start=0, facet=None, highlight=None, sort=None, partial=None,
|
||
|
options=None):
|
||
|
"""
|
||
|
Send a query to CloudSearch
|
||
|
|
||
|
Each search query should use at least the q or bq argument to specify
|
||
|
the search parameter. The other options are used to specify the
|
||
|
criteria of the search.
|
||
|
|
||
|
:type q: string
|
||
|
:param q: A string to search the default search fields for.
|
||
|
|
||
|
:type parser: string
|
||
|
:param parser: The parser to use. 'simple', 'structured', 'lucene', 'dismax'
|
||
|
|
||
|
:type fq: string
|
||
|
:param fq: The filter query to use.
|
||
|
|
||
|
:type sort: List of strings
|
||
|
:param sort: A list of fields or rank expressions used to order the
|
||
|
search results. Order is handled by adding 'desc' or 'asc' after the field name.
|
||
|
``['year desc', 'author asc']``
|
||
|
|
||
|
:type return_fields: List of strings
|
||
|
:param return_fields: A list of fields which should be returned by the
|
||
|
search. If this field is not specified, only IDs will be returned.
|
||
|
``['headline']``
|
||
|
|
||
|
:type size: int
|
||
|
:param size: Number of search results to specify
|
||
|
|
||
|
:type start: int
|
||
|
:param start: Offset of the first search result to return (can be used
|
||
|
for paging)
|
||
|
|
||
|
:type facet: dict
|
||
|
:param facet: Dictionary of fields for which facets should be returned
|
||
|
The facet value is string of JSON options
|
||
|
``{'year': '{sort:"bucket", size:3}', 'genres': '{buckets:["Action","Adventure","Sci-Fi"]}'}``
|
||
|
|
||
|
:type highlight: dict
|
||
|
:param highlight: Dictionary of fields for which highlights should be returned
|
||
|
The facet value is string of JSON options
|
||
|
``{'genres': '{format:'text',max_phrases:2,pre_tag:'<b>',post_tag:'</b>'}'}``
|
||
|
|
||
|
:type partial: bool
|
||
|
:param partial: Should partial results from a partioned service be returned if
|
||
|
one or more index partitions are unreachable.
|
||
|
|
||
|
:type options: str
|
||
|
:param options: Options for the query parser specified in *parser*.
|
||
|
Specified as a string in JSON format.
|
||
|
``{fields: ['title^5', 'description']}``
|
||
|
|
||
|
:rtype: :class:`boto.cloudsearch2.search.SearchResults`
|
||
|
:return: Returns the results of this search
|
||
|
|
||
|
The following examples all assume we have indexed a set of documents
|
||
|
with fields: *author*, *date*, *headline*
|
||
|
|
||
|
A simple search will look for documents whose default text search
|
||
|
fields will contain the search word exactly:
|
||
|
|
||
|
>>> search(q='Tim') # Return documents with the word Tim in them (but not Timothy)
|
||
|
|
||
|
A simple search with more keywords will return documents whose default
|
||
|
text search fields contain the search strings together or separately.
|
||
|
|
||
|
>>> search(q='Tim apple') # Will match "tim" and "apple"
|
||
|
|
||
|
More complex searches require the boolean search operator.
|
||
|
|
||
|
Wildcard searches can be used to search for any words that start with
|
||
|
the search string.
|
||
|
|
||
|
>>> search(q="'Tim*'") # Return documents with words like Tim or Timothy)
|
||
|
|
||
|
Search terms can also be combined. Allowed operators are "and", "or",
|
||
|
"not", "field", "optional", "token", "phrase", or "filter"
|
||
|
|
||
|
>>> search(q="(and 'Tim' (field author 'John Smith'))", parser='structured')
|
||
|
|
||
|
Facets allow you to show classification information about the search
|
||
|
results. For example, you can retrieve the authors who have written
|
||
|
about Tim with a max of 3
|
||
|
|
||
|
>>> search(q='Tim', facet={'Author': '{sort:"bucket", size:3}'})
|
||
|
"""
|
||
|
|
||
|
query = self.build_query(q=q, parser=parser, fq=fq, rank=rank,
|
||
|
return_fields=return_fields,
|
||
|
size=size, start=start, facet=facet,
|
||
|
highlight=highlight, sort=sort,
|
||
|
partial=partial, options=options)
|
||
|
return self(query)
|
||
|
|
||
|
def _search_with_auth(self, params):
|
||
|
return self.domain_connection.search(params.pop("q", ""), **params)
|
||
|
|
||
|
def _search_without_auth(self, params, api_version):
|
||
|
url = "http://%s/%s/search" % (self.endpoint, api_version)
|
||
|
resp = self.session.get(url, params=params)
|
||
|
|
||
|
return {'body': resp.content.decode('utf-8'), 'status_code': resp.status_code}
|
||
|
|
||
|
def __call__(self, query):
|
||
|
"""Make a call to CloudSearch
|
||
|
|
||
|
:type query: :class:`boto.cloudsearch2.search.Query`
|
||
|
:param query: A group of search criteria
|
||
|
|
||
|
:rtype: :class:`boto.cloudsearch2.search.SearchResults`
|
||
|
:return: search results
|
||
|
"""
|
||
|
api_version = '2013-01-01'
|
||
|
if self.domain and self.domain.layer1:
|
||
|
api_version = self.domain.layer1.APIVersion
|
||
|
|
||
|
if self.sign_request:
|
||
|
data = self._search_with_auth(query.to_domain_connection_params())
|
||
|
else:
|
||
|
r = self._search_without_auth(query.to_params(), api_version)
|
||
|
|
||
|
_body = r['body']
|
||
|
_status_code = r['status_code']
|
||
|
|
||
|
try:
|
||
|
data = json.loads(_body)
|
||
|
except ValueError:
|
||
|
if _status_code == 403:
|
||
|
msg = ''
|
||
|
import re
|
||
|
g = re.search('<html><body><h1>403 Forbidden</h1>([^<]+)<', _body)
|
||
|
try:
|
||
|
msg = ': %s' % (g.groups()[0].strip())
|
||
|
except AttributeError:
|
||
|
pass
|
||
|
raise SearchServiceException('Authentication error from Amazon%s' % msg)
|
||
|
raise SearchServiceException("Got non-json response from Amazon. %s" % _body, query)
|
||
|
|
||
|
if 'messages' in data and 'error' in data:
|
||
|
for m in data['messages']:
|
||
|
if m['severity'] == 'fatal':
|
||
|
raise SearchServiceException("Error processing search %s "
|
||
|
"=> %s" % (params, m['message']), query)
|
||
|
elif 'error' in data:
|
||
|
raise SearchServiceException("Unknown error processing search %s"
|
||
|
% json.dumps(data), query)
|
||
|
|
||
|
data['query'] = query
|
||
|
data['search_service'] = self
|
||
|
|
||
|
return SearchResults(**data)
|
||
|
|
||
|
def get_all_paged(self, query, per_page):
|
||
|
"""Get a generator to iterate over all pages of search results
|
||
|
|
||
|
:type query: :class:`boto.cloudsearch2.search.Query`
|
||
|
:param query: A group of search criteria
|
||
|
|
||
|
:type per_page: int
|
||
|
:param per_page: Number of docs in each :class:`boto.cloudsearch2.search.SearchResults` object.
|
||
|
|
||
|
:rtype: generator
|
||
|
:return: Generator containing :class:`boto.cloudsearch2.search.SearchResults`
|
||
|
"""
|
||
|
query.update_size(per_page)
|
||
|
page = 0
|
||
|
num_pages_needed = 0
|
||
|
while page <= num_pages_needed:
|
||
|
results = self(query)
|
||
|
num_pages_needed = results.num_pages_needed
|
||
|
yield results
|
||
|
query.start += query.real_size
|
||
|
page += 1
|
||
|
|
||
|
def get_all_hits(self, query):
|
||
|
"""Get a generator to iterate over all search results
|
||
|
|
||
|
Transparently handles the results paging from Cloudsearch
|
||
|
search results so even if you have many thousands of results
|
||
|
you can iterate over all results in a reasonably efficient
|
||
|
manner.
|
||
|
|
||
|
:type query: :class:`boto.cloudsearch2.search.Query`
|
||
|
:param query: A group of search criteria
|
||
|
|
||
|
:rtype: generator
|
||
|
:return: All docs matching query
|
||
|
"""
|
||
|
page = 0
|
||
|
num_pages_needed = 0
|
||
|
while page <= num_pages_needed:
|
||
|
results = self(query)
|
||
|
num_pages_needed = results.num_pages_needed
|
||
|
for doc in results:
|
||
|
yield doc
|
||
|
query.start += query.real_size
|
||
|
page += 1
|
||
|
|
||
|
def get_num_hits(self, query):
|
||
|
"""Return the total number of hits for query
|
||
|
|
||
|
:type query: :class:`boto.cloudsearch2.search.Query`
|
||
|
:param query: a group of search criteria
|
||
|
|
||
|
:rtype: int
|
||
|
:return: Total number of hits for query
|
||
|
"""
|
||
|
query.update_size(1)
|
||
|
return self(query).hits
|