Merge pull request #174 from cul-it/hotfix/0.1.1
Hotfix/0.1.1
mhl10 authored Apr 26, 2018
2 parents c58c2f8 + dc45bbd, commit a6c3eba
Showing 19 changed files with 183 additions and 66 deletions.
16 changes: 8 additions & 8 deletions Pipfile
@@ -7,7 +7,7 @@ name = "pypi"

[packages]

-arxiv-base = "==0.5.1"
+arxiv-base = "==0.6.1"
boto = "==2.48.0"
"boto3" = "==1.6.6"
botocore = "==1.9.6"
@@ -19,15 +19,15 @@ dataclasses = "==0.4"
docutils = "==0.14"
elasticsearch = "==6.2.0"
elasticsearch-dsl = "==6.1.0"
-Flask = "==0.12.2"
-"Flask-S3" = "==0.3.3"
+flask = "==0.12.2"
+"flask-s3" = "==0.3.3"
idna = "==2.6"
ipaddress = "==1.0.19"
itsdangerous = "==0.24"
"Jinja2" = "==2.10"
"jinja2" = "==2.10"
jmespath = "==0.9.3"
jsonschema = "==2.6.0"
-MarkupSafe = "==1.0"
+markupsafe = "==1.0"
mccabe = "==0.6.1"
mock = "==2.0.0"
mypy = "==0.560"
@@ -44,11 +44,11 @@ requests = "==2.18.4"
"s3transfer" = "==0.1.13"
snowballstemmer = "==1.2.1"
thrift = "==0.11.0"
-thrift_connector = "==0.23"
+thrift-connector = "==0.23"
typed-ast = "==1.1.0"
"urllib3" = "==1.22"
-Werkzeug = "==0.13"
-WTForms = "==2.1"
+werkzeug = "==0.13"
+wtforms = "==2.1"
bleach = "*"


12 changes: 6 additions & 6 deletions Pipfile.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion requirements/dev.txt
@@ -1,5 +1,5 @@
amazon-kclpy==1.4.4
-arxiv-base==0.5.1
+arxiv-base==0.6.1
boto==2.48.0
boto3==1.6.6
botocore==1.9.6
2 changes: 1 addition & 1 deletion requirements/prod.txt
@@ -1,5 +1,5 @@
amazon-kclpy==1.4.4
-arxiv-base==0.5.1
+arxiv-base==0.6.1
boto==2.48.0
boto3==1.6.6
botocore==1.9.6
2 changes: 1 addition & 1 deletion search/controllers/advanced/forms.py
@@ -170,6 +170,6 @@ class AdvancedSearchForm(Form):
('-submitted_date', 'Submission date (newest first)'),
('submitted_date', 'Submission date (oldest first)'),
('', 'Relevance')
-], validators=[validators.Optional()], default='-announced_date_first')
+], validators=[validators.Optional()], default='')
include_older_versions = BooleanField('Include older versions '
'of papers in results')
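
The only change in this file is the default sort order, which moves from newest-announced-first to relevance (the empty choice). As a minimal, hedged sketch of how such a default behaves in WTForms when no `order` value is submitted (the cut-down form and reduced choice list below are illustrative, not the project's real form):

```python
from wtforms import Form, SelectField, validators


class SortDemoForm(Form):
    """Cut-down stand-in for the search forms' sort field."""

    order = SelectField('Sort results by', choices=[
        ('-submitted_date', 'Submission date (newest first)'),
        ('submitted_date', 'Submission date (oldest first)'),
        ('', 'Relevance'),
    ], validators=[validators.Optional()], default='')


form = SortDemoForm()            # no request data supplied
assert form.order.data == ''     # falls back to the new default: relevance
```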
20 changes: 15 additions & 5 deletions search/controllers/simple/__init__.py
@@ -84,11 +84,18 @@ def search(request_params: dict) -> Response:
# Fall back to form-based search.
form = SimpleSearchForm(request_params)

-# Temporary workaround to support classic help search
-if form.query.data and form.searchtype.data == 'help':
-    return {}, status.HTTP_301_MOVED_PERMANENTLY,\
-        {'Location': 'https://arxiv.org/help/search?method=and'
-         f'&format=builtin-short&sort=score&words={form.query.data}'}
+if form.query.data:
+    # Temporary workaround to support classic help search
+    if form.searchtype.data == 'help':
+        return {}, status.HTTP_301_MOVED_PERMANENTLY,\
+            {'Location': 'https://arxiv.org/help/search?method=and'
+             f'&format=builtin-short&sort=score&words={form.query.data}'}
+
+    # Support classic "experimental" search
+    elif form.searchtype.data == 'full_text':
+        return {}, status.HTTP_301_MOVED_PERMANENTLY,\
+            {'Location': 'http://search.arxiv.org:8081/'
+             f'?in=&query={form.query.data}'}

q: Optional[Query]
if form.validate():
@@ -120,6 +127,9 @@ def search(request_params: dict) -> Response:
"search again. If this problem persists, please report it to "
"[email protected]."
) from e
+except Exception as e:
+    print(e)
+    raise
else:
logger.debug('form is invalid: %s', str(form.errors))
if 'order' in form.errors or 'size' in form.errors:
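
The hunk above adds a second redirect: classic `full_text` queries are now sent to the experimental search host before any form validation runs. A minimal sketch, not part of this PR, of how that branch could be exercised by calling the controller directly; it assumes the controller accepts a Werkzeug `MultiDict` of request parameters and that `status.HTTP_301_MOVED_PERMANENTLY` equals 301.

```python
from werkzeug.datastructures import MultiDict

from search.controllers.simple import search


def test_full_text_query_redirects_to_classic_search():
    """A 'full_text' searchtype with a query should 301 to the classic host."""
    params = MultiDict({'searchtype': 'full_text', 'query': 'dark matter'})
    data, status_code, headers = search(params)
    assert status_code == 301
    assert headers['Location'].startswith('http://search.arxiv.org:8081/')
```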
2 changes: 1 addition & 1 deletion search/controllers/simple/forms.py
@@ -42,7 +42,7 @@ class SimpleSearchForm(Form):
('-submitted_date', 'Submission date (newest first)'),
('submitted_date', 'Submission date (oldest first)'),
('', 'Relevance')
-], validators=[validators.Optional()], default='-announced_date_first')
+], validators=[validators.Optional()], default='')

def validate_query(form: Form, field: StringField) -> None:
"""Validate the length of the querystring, if searchtype is set."""
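
For readers unfamiliar with the `validate_query` hook in the context above: WTForms calls any method named `validate_<fieldname>` as an extra validator during `Form.validate()`. A generic, hedged sketch of the pattern follows (the length limit and message are invented, not the project's rule):

```python
from wtforms import Form, StringField, ValidationError


class DemoSearchForm(Form):
    """Illustrative stand-in; not the project's SimpleSearchForm."""

    query = StringField('Search term')

    def validate_query(form, field):
        """Reject overly long querystrings; the real rule may differ."""
        if field.data and len(field.data) > 200:
            raise ValidationError('Query is too long.')


form = DemoSearchForm(data={'query': 'x' * 500})
assert not form.validate()   # the inline validator rejects the long query
```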
7 changes: 5 additions & 2 deletions search/process/tests.py
@@ -18,8 +18,11 @@ def test_id(self):
self.assertEqual(doc.id, '1234.56789')

def test_abstract(self):
"""Field ``abstract`` is populated from ``abstract``."""
meta = DocMeta(**{'paper_id': '1234.56789', 'abstract': 'abstract!'})
"""Field ``abstract`` is populated from ``abstract_utf8``."""
meta = DocMeta(**{
'paper_id': '1234.56789',
'abstract_utf8': 'abstract!'
})
doc = transform.to_search_document(meta)
self.assertEqual(doc.abstract, 'abstract!')

4 changes: 3 additions & 1 deletion search/process/transform.py
@@ -43,6 +43,8 @@ def _constructACMClass(meta: DocMeta) -> Optional[list]:


def _transformAuthor(author: dict) -> dict:
+# TODO: we should not be stripping punctuation from the name here.
+# This should be handled by the analyzer. This is related to ARXIVNG-543.
author['first_name'] = _strip_punctuation(author['first_name']).strip()
author['full_name'] = re.sub(r'\s+', ' ', f"{author['first_name']} {author['last_name']}")
author['initials'] = [pt[0] for pt in author['first_name'].split() if pt]
@@ -81,7 +83,7 @@ def _constructDOI(meta: DocMeta) -> List[str]:
TransformType = Union[str, Callable]
_transformations: List[Tuple[str, TransformType, bool]] = [
("id", lambda meta: meta.paper_id if meta.is_current else _constructPaperVersion(meta), True),
("abstract", "abstract", False),
("abstract", "abstract_utf8", False),
("authors", _constructAuthors, True),
("authors_freeform", "authors_utf8", False),
("owners", _constructAuthorOwners, False),
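
To make concrete what the new TODO in `_transformAuthor` is flagging, here is a small hedged sketch of the punctuation stripping and whitespace collapsing applied to a first name. The helper is re-stated locally rather than imported, and the sample name is invented.

```python
import re
from string import punctuation


def _strip_punctuation(s: str) -> str:
    """Local re-statement of the module's helper, for illustration only."""
    return ''.join(c for c in s if c not in punctuation)


author = {'first_name': 'J.-P.', 'last_name': 'Serre'}
first_name = _strip_punctuation(author['first_name']).strip()   # 'J.-P.' -> 'JP'
full_name = re.sub(r'\s+', ' ', f"{first_name} {author['last_name']}")
initials = [pt[0] for pt in first_name.split() if pt]

print(full_name)   # 'JP Serre'; the lost hyphen and periods are the ARXIVNG-543 concern
print(initials)    # ['J']
```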
3 changes: 0 additions & 3 deletions search/services/index/__init__.py
@@ -92,9 +92,6 @@ def handle_es_exceptions() -> Generator:
class SearchSession(object):
"""Encapsulates session with Elasticsearch host."""

-# TODO: we need to take on security considerations here. Presumably we will
-# use SSL. Presumably we will use HTTP Auth, or something else.
-
def __init__(self, host: str, index: str, port: int=9200,
scheme: str='http', user: Optional[str]=None,
password: Optional[str]=None, mapping: Optional[str]=None,
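
For reference, constructing the session with the signature shown above might look like the following; the host and index values are placeholders, not values taken from this repository's configuration.

```python
from search.services.index import SearchSession

# Placeholder connection details; the real application reads these from config.
session = SearchSession(host='localhost', index='arxiv', port=9200,
                        scheme='http', user=None, password=None)
```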
85 changes: 69 additions & 16 deletions search/services/index/authors.py
@@ -1,16 +1,41 @@
"""Query-builders and helpers for searching by author name."""

from typing import Tuple, Optional, List
+import re
+from string import punctuation
from elasticsearch_dsl import Search, Q, SF
-from .util import wildcardEscape, is_literal_query, Q_
+from .util import wildcardEscape, is_literal_query, Q_, escape


+STOP = ["and", "or", "the", "of", "a", "for", "an"]


+# TODO: remove this when we address the author name bug in
+# search.process.transform..
+def _strip_punctuation(s: str) -> str:
+    return ''.join([c for c in s if c not in punctuation])


+# TODO: revisit author name indexing in document mappings.
+# Ideally stopwords would be removed at index time, but authors are indexed
+# as keywords which makes that difficult.
+def _remove_stopwords(term: str) -> str:
+    """Remove common stopwords that will match on institutions."""
+    _term = str(term)
+    for stopword in STOP:
+        _term = re.sub(f"(^|\s+){stopword}(\s+|$)", " ", _term)
+    return _term


def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
"""Parse a name string into its (likely) constituent parts."""
-# We interpret the comma as separating the surname from the forename.
-if "," in au_safe:
-    surname, forename = au_safe.split(',')
-    return surname.strip(), forename.strip()
+au_parts = au_safe.split(',')
+if len(au_parts) >= 2:
+    surname = au_parts[0]
+    forename = au_parts[1]
+    return surname.strip(), forename.strip()

# Otherwise, treat the last word in the name as the surname. This isn't
# a great approach from first principles, but it produces reasonable
@@ -25,6 +50,7 @@ def _parseName(au_safe: str) -> Tuple[str, Optional[str]]:
# pieces, for readability.
def construct_author_query(term: str) -> Q:
"""Generate an author name query in the ElasticSearch DSL."""
+term = escape(term)
_author_q = Q()
score_functions: List = []

@@ -35,24 +61,41 @@ def construct_author_query(term: str) -> Q:
au_name, has_wildcard = wildcardEscape(au_name)
au_safe = au_name.replace('*', '').replace('?', '').replace('"', '')
surname_safe, forename_safe = _parseName(au_safe)

if forename_safe is not None:
+# TODO: remove this when the author name bug is fixed in
+# search.process.transform. Since we are erroneously removing
+# punctuation from author names prior to indexing, it's important
+# to do the same here so that results are returned.
+forename_safe = _strip_punctuation(forename_safe)

fullname_safe = f'{forename_safe} {surname_safe}'
else:
fullname_safe = surname_safe
_q = (
# Matching on keyword field is effectively an exact match.
Q('match', **{
'authors__full_name__exact': {
-'query': fullname_safe, 'boost': 10
-}
+'query': fullname_safe, 'boost': 30
+},
})

# The next best case is that the query is a substring of
# the full name.
| Q('match_phrase', **{
'authors__full_name': {'query': fullname_safe, 'boost': 9}
})
)
if not is_literal_query(term):
# Search across all authors, and prefer documents for which a
# greater number of authors respond. For this part of the search
# we want to avoid artificially high scores when only initials
# match, so we drop solo characters from the query.
term_sans_inits = ' '.join(part for part in
_remove_stopwords(term).split()
if len(part) > 1)
_q |= Q('multi_match', fields=['authors.full_name'],
query=term_sans_inits, boost=8, type="cross_fields")
# We support wildcards (?*) within each author name. Since
# ES will treat the non-wildcard part of the term as a literal,
# we need to apply each word in the name separately.
@@ -89,7 +132,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__full_name': fullname_safe
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -99,7 +143,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__full_name_initialized': au_safe
}
-)
+),
+score_mode='sum'
)
})
]
@@ -115,7 +160,8 @@ def construct_author_query(term: str) -> Q:
'match', **{
'authors__last_name': surname_safe
}
-)
+),
+score_mode='sum'
)
}),
]
@@ -151,7 +197,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name__exact": forename_safe
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -161,7 +208,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name__exact": init_forename
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -170,7 +218,8 @@ def construct_author_query(term: str) -> Q:
"match_phrase", **{
"authors__first_name": forename_safe
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -179,7 +228,8 @@ def construct_author_query(term: str) -> Q:
"match_phrase", **{
"authors__first_name": init_forename
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -188,7 +238,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name": forename_safe
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -197,7 +248,8 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__first_name": init_forename
}
-)
+),
+score_mode='sum'
)
}),
SF({
@@ -206,11 +258,12 @@ def construct_author_query(term: str) -> Q:
"match", **{
"authors__initials": init_forename.lower()
}
-)
+),
+score_mode='sum'
)
}),
]
_author_q &= Q("nested", path="authors", query=_q)
_author_q &= Q("nested", path="authors", query=_q, score_mode='sum')

return Q('function_score', query=_author_q,
score_mode="sum", boost=1, boost_mode='multiply',
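
Two themes run through this file: new helpers that strip punctuation and stopwords from author terms before querying, and `score_mode='sum'` added to the nested and function-score clauses so that contributions from multiple matching authors are summed. A rough, hedged re-statement of the two helpers with example inputs (illustrative only, not copied verbatim from the module):

```python
import re
from string import punctuation

STOP = ["and", "or", "the", "of", "a", "for", "an"]


def _strip_punctuation(s: str) -> str:
    """Drop all punctuation characters from a name."""
    return ''.join(c for c in s if c not in punctuation)


def _remove_stopwords(term: str) -> str:
    """Drop stopwords so a query like 'university of tokyo' cannot match on 'of'."""
    _term = str(term)
    for stopword in STOP:
        _term = re.sub(f"(^|\\s+){stopword}(\\s+|$)", " ", _term)
    return _term


print(_strip_punctuation("O'Brien, J.-P."))               # OBrien JP
print(_remove_stopwords("institute for advanced study"))  # institute advanced study
```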