Skip to content

Commit

Permalink
import: add new sources to import from the web
Browse files Browse the repository at this point in the history
* Adds Library of Congress catalog.
* Adds DNB catalog.
* Adds SLSP catalog.
* Adds UGent catalog.
* Adds KULeuven catalog.
* Adds import record serializer factory.
* Fixes import subtype facets.
* Updates unimarc mapping for BNF import (Tag 464).
* Adds API endpoint to export external source configuration.

* Closes rero#2065.
* Closes rero#1825.

Co-Authored-by: Benoit Erken <[email protected]>
Co-Authored-by: Laurent Dubois <[email protected]>
Co-Authored-by: Renaud Michotte <[email protected]>
  • Loading branch information
3 people committed Nov 5, 2021
1 parent 70ab240 commit c00a06f
Show file tree
Hide file tree
Showing 51 changed files with 46,762 additions and 96 deletions.
63 changes: 57 additions & 6 deletions rero_ils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2808,14 2808,65 @@ def _(x):
'markdown_captions'
))

# IMPORT
# ====
RERO_IMPORT_REST_ENDPOINTS = dict(
bnf=dict(
# IMPORT FROM EXTERNAL SOURCE CONFIGURATION
# =============================================================================
# Endpoint to load data from external repository. Each endpoint must be
# defined as a dict with the following keys:
# * key: (required) the endpoint key (used to build the API endpoint)
# * import_class: (required) the class used to import the external
# document from this source.
# * import_size: (required) the maximum number of documents returned when
# searching on this source.
# * label: (required) the label displayed in the professional interface for
# this source. This label is not translated.
# * weight: (optional) used to sort the sources in the professional
# interface. Default value is 100. The lower the weight, the higher
# the priority.

RERO_IMPORT_REST_ENDPOINTS = [
dict(
key='loc',
import_class='rero_ils.modules.imports.api:LoCImport',
import_size=50,
label='Library of Congress',
weight=70
),
dict(
key='bnf',
import_class='rero_ils.modules.imports.api:BnfImport',
import_size=50
import_size=50,
label='BNF',
weight=20
),
dict(
key='dnb',
import_class='rero_ils.modules.imports.api:DNBImport',
import_size=50,
label='DNB',
weight=20
),
dict(
key='slsp',
import_class='rero_ils.modules.imports.api:SLSPImport',
import_size=50,
label='SLSP',
weight=15
),
dict(
key='ugent',
import_class='rero_ils.modules.imports.api:UGentImport',
import_size=50,
label='UGent',
weight=30
),
dict(
key='kul',
import_class='rero_ils.modules.imports.api:KULImport',
import_size=50,
label='KULeuven',
weight=30
)
)
]

# SRU
# ====
Expand Down
41 changes: 35 additions & 6 deletions rero_ils/dojson/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 351,9 @@ def get_field_items(value):
def build_string_from_subfields(value, subfield_selection, separator=' '):
    """Build a string from the selected subfields, keeping field order.

    :param value: the field value; it is handed to ``get_field_items`` which
        yields ``(code, content)`` pairs.
    :param subfield_selection: container of subfield codes to keep.
    :param separator: string inserted between the kept subfield contents.
    :returns: the kept subfield contents joined with ``separator``.
    """
    # U+0098 and U+009C are control characters that must not end up in the
    # resulting string; drop both in a single pass.
    unwanted = {0x98: None, 0x9C: None}
    parts = [
        content.translate(unwanted)
        for code, content in get_field_items(value)
        if code in subfield_selection
    ]
    return separator.join(parts)


Expand Down Expand Up @@ -399,12 401,34 @@ def get_contribution_link(bibid, reroid, id, key):
prod_host = 'mef.rero.ch'
test_host = os.environ.get('RERO_ILS_MEF_HOST', 'mef.rero.ch')
mef_url = f'https://{test_host}/api/'

match = re_identified.search(id)
if type(id) is str:
match = re_identified.search(id)
else:
match = re_identified.search(id[0])
if match and len(match.groups()) == 2 and key[:3] in _CONTRIBUTION_TAGS:
match_type = match.group(1).lower()
match_value = match.group(2)
if match_type == 'idref':
match_type.replace('de-588', 'gnd')
# if we have a viafid, look for the contributor
if match_type == "viaf":
url = f'{mef_url}/mef/?q=viaf_pid:{match_value}'
response = requests_retry_session().get(url)
status_code = response.status_code
if status_code == requests.codes.ok:
try:
if response.json()['hits']['hits'][0]\
['metadata']['idref']['pid']:
match_value = response.json()['hits']['hits'][0]\
['metadata']['idref']['pid']
match_type = 'idref'
elif response.json()['hits']['hits'][0]\
['metadata']['gnd']['pid']:
match_value = response.json()['hits']['hits'][0]\
['metadata']['idref']['pid']
match_type = 'gnd'
except Exception as err:
pass
if match_type == 'idref' or match_type == 'gnd':
url = f'{mef_url}{match_type}/{match_value}'
response = requests_retry_session().get(url)
status_code = response.status_code
Expand Down Expand Up @@ -680,6 704,7 @@ def clean_punctuation(value, punct, spaced_punct):
data = []
value = clean_punctuation(label, punct, spaced_punct).strip()
if value:
value = value.replace('\u0098', '').replace('\u009C', '')
data = [{'value': value}]
else:
error_print('WARNING NO VALUE:', self.bib_id, self.rero_id, tag,
Expand Down Expand Up @@ -817,7 842,7 @@ def extract_description_from_marc_field(self, key, value, data):
book_formats.append(book_format)
dim = remove_trailing_punctuation(
data=dimension.rstrip(),
punctuation=' ,:;&'
punctuation=' ,:;&.'
)
if dim:
add_data_and_sort_list(
Expand Down Expand Up @@ -1141,9 1166,12 @@ def init_country(self):
self.rero_id, cantons_codes)
if self.cantons:
self.country = 'sz'
if self.country is None:
self.country = self.field_008_data[15:18].rstrip()
else:
try:
self.country = self.field_008_data[15:18].rstrip()

except Exception as err:
pass

Expand Down Expand Up @@ -1853,7 1881,8 @@ def build_identifier(data):
'RERO': 'RERO',
'RERO-RAMEAU': 'RERO-RAMEAU',
'IDREF': 'IdRef',
'GND': 'GND'
'GND': 'GND',
'DE-588': 'GND'
}
result = {}
data_0 = utils.force_list(data.get('0'))
Expand Down
10 changes: 8 additions & 2 deletions rero_ils/modules/documents/dojson/contrib/marc21tojson/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 2,7 @@
#
# RERO ILS
# Copyright (C) 2019 RERO
# Copyright (C) 2021 UCLOUVAIN
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
Expand Down Expand Up @@ -1282,13 1283,18 @@ def marc21_to_identifiedBy_from_field_035(self, key, value):
"""Get identifier from field 035."""
subfield_a = not_repetitive(marc21.bib_id, marc21.rero_id,
key, value, 'a', default='').strip()
identifiedBy = self.get('identifiedBy', [])
if subfield_a:
source = 'RERO'
# search source between parenthesis
match = re.search(r'\(([^()]+)\)', subfield_a)
if match:
source = match.group(1)
identifier = {
'value': subfield_a,
'type': 'bf:Local',
'source': 'RERO'
'source': source,
}
identifiedBy = self.get('identifiedBy', [])
identifiedBy.append(identifier)
return identifiedBy or None

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 1,23 @@
# -*- coding: utf-8 -*-
#
# RERO ILS
# Copyright (C) 2021 RERO
# Copyright (C) 2021 UCLOUVAIN
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""MARC21 RERO to JSON."""

from .model import marc21

# A one-element tuple needs the trailing comma: ``('marc21')`` is only the
# string 'marc21', which a star-import would iterate character by character.
__all__ = ('marc21',)
Loading

0 comments on commit c00a06f

Please sign in to comment.