Skip to content

Commit

Permalink
pilot: several improvements
Browse files Browse the repository at this point in the history
* Uses an existing DB for deployment.
* Implements lazy reading of xml files.
* Adds xml record extraction.
* Adds operation_logs pid dependencies checking.
* Adds a parameter for saving error records during record creation.
* Fixes item creation with holdings without patterns.
* Fixes editor options for docmaintype_audio subtypes closes: rero#1712.

Co-Authored-by: Peter Weber <[email protected]>
  • Loading branch information
rerowep committed Feb 23, 2021
1 parent 6392a25 commit 80fcc81
Show file tree
Hide file tree
Showing 8 changed files with 209 additions and 39 deletions.
38 changes: 38 additions & 0 deletions data/pid_dependencies.json
Original file line number Diff line number Diff line change
Expand Up @@ -251,5 +251,43 @@
}
}
]
},
{
"name": "operation_logs_creations",
"filename": "operation_logs_creations.json",
"dependencies": [
{
"name": "organisation",
"optional": "True"
},
{
"name": "record",
"refs": {
"organisation": "organisations",
"item": "items",
"holding": "holdings",
"document": "documents"
}
}
]
},
{
"name": "operation_logs_updates",
"filename": "operation_logs_updates.json",
"dependencies": [
{
"name": "organisation",
"optional": "True"
},
{
"name": "record",
"refs": {
"organisation": "organisations",
"item": "items",
"holding": "holdings",
"document": "documents"
}
}
]
}
]
19 changes: 14 additions & 5 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ flask-cors = ">3.0.8"
celery = ">=5.0.0"
cryptography = ">3.2"
freezegun = "^1.1.0"
lazyreader = ">1.0.0"

[tool.poetry.dev-dependencies]
## Python packages development dependencies (order matters)
Expand Down
120 changes: 97 additions & 23 deletions rero_ils/modules/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from __future__ import absolute_import, print_function

import difflib
import gc
import itertools
import json
import logging
Expand All @@ -43,7 +42,7 @@
import yaml
from babel import Locale, core
from celery.bin.control import inspect
from dojson.contrib.marc21.utils import create_record, split_stream
from dojson.contrib.marc21.utils import create_record
from elasticsearch_dsl.query import Q
from flask import current_app
from flask.cli import with_appcontext
Expand Down Expand Up @@ -76,7 +75,8 @@
from .operation_logs.cli import migrate_virtua_operation_logs
from .patrons.cli import import_users
from .tasks import process_bulk_queue
from .utils import get_record_class_from_schema_or_pid_type, read_json_record
from .utils import get_record_class_from_schema_or_pid_type, \
read_json_record, read_xml_record
from ..modules.providers import append_fixtures_new_identifiers
from ..modules.utils import get_schema_for_resource

Expand Down Expand Up @@ -296,10 +296,11 @@ def init_index(force):
is_flag=True, default=False)
@click.option('-P', '--pid-check', 'pid_check',
is_flag=True, default=False)
@click.option('-e', '--save_errors', 'save_errors', type=click.File('w'))
@click.argument('infile', type=click.File('r'), default=sys.stdin)
@with_appcontext
def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
pid_type, lazy, dont_stop_on_error, pid_check):
pid_type, lazy, dont_stop_on_error, pid_check, save_errors):
"""Load REROILS record.
:param infile: Json file
Expand All @@ -312,6 +313,7 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
:param lazy: lazy reads file
:param dont_stop_on_error: don't stop on error
:param pidcheck: check pids
:param save_errors: save error records to file
"""
click.secho(
'Loading {pid_type} records from {file_name}.'.format(
Expand All @@ -323,7 +325,14 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,

record_class = get_record_class_from_schema_or_pid_type(pid_type=pid_type)

error_records = []
if save_errors:
errors = 0
name, ext = os.path.splitext(infile.name)
err_file_name = '{name}_errors{ext}'.format(name=name, ext=ext)
error_file = open(err_file_name, 'w')
error_file.write('[\n')
error_file.close()

pids = []
if lazy:
# try to lazy read json file (slower, better memory management)
Expand All @@ -350,7 +359,6 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
)
)
except Exception as err:
error_records.append(record)
click.secho(
'{count: <8} {type} create error {pid}: {err}'.format(
count=count,
Expand All @@ -362,6 +370,11 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
)
if debug:
traceback.print_exc()

if save_errors:
if errors > 0:
error_file.write(',\n')
error_file.write(json.dumps(record, indent=2))
if not dont_stop_on_error:
sys.exit(1)
db.session.flush()
Expand All @@ -372,6 +385,9 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
click.echo('DB commit: {count}'.format(count=count))
db.session.commit()

if save_errors:
error_file.write(']')

if append:
click.secho(
'Append fixtures new identifiers: {len}'.format(len=len(pids))
Expand All @@ -389,17 +405,6 @@ def create(infile, append, reindex, dbcommit, commit, verbose, debug, schema,
fg='red'
)

if error_records:
name, ext = os.path.splitext(infile.name)
err_file_name = '{name}_errors{ext}'.format(name=name, ext=ext)
click.secho('Write error file: {name}'.format(name=err_file_name))
with open(err_file_name, 'w') as error_file:
error_file.write('[\n')
for error_record in error_records:
for line in json.dumps(error_record, indent=2).split('\n'):
error_file.write(' ' + line + '\n')
error_file.write(']')


@fixtures.command('count')
@click.option('-l', '--lazy', 'lazy', is_flag=True, default=False)
Expand Down Expand Up @@ -779,7 +784,7 @@ class Marc21toJson():
'schema']

def __init__(self, xml_file, json_file_ok, xml_file_error,
parallel=8, chunk=5000,
parallel=8, chunk=10000,
verbose=False, debug=False, pid_required=False, schema=None):
"""Constructor."""
self.count = 0
Expand Down Expand Up @@ -831,8 +836,6 @@ def write_results(self):
else:
self.count_ko += 1
self.xml_file_error.write(data)
# free memory from garbage collector
gc.collect()

def wait_free_process(self):
"""Wait for next process to finish."""
Expand Down Expand Up @@ -896,7 +899,7 @@ def write_stop(self):
def start(self):
"""Start the transformation."""
self.write_start()
for marc21xml in split_stream(self.xml_file):
for marc21xml in read_xml_record(self.xml_file):
marc21json_record = create_record(marc21xml)
self.active_records.append({
'json': marc21json_record,
Expand Down Expand Up @@ -937,11 +940,11 @@ def active_records(self):


@utils.command('marc21tojson')
@click.argument("xml_file", type=click.File("rb"))
@click.argument("xml_file", type=click.File("r"))
@click.argument('json_file_ok', type=click.File('w'))
@click.argument('xml_file_error', type=click.File('wb'))
@click.option('-p', '--parallel', 'parallel', default=8)
@click.option("-c", "--chunk", "chunk", default=5000)
@click.option("-c", "--chunk", "chunk", default=10000)
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
@click.option('-d', '--debug', 'debug', is_flag=True, default=False)
@click.option('-r', '--pidrequired', 'pid_required', is_flag=True,
Expand Down Expand Up @@ -975,6 +978,77 @@ def marc21json(xml_file, json_file_ok, xml_file_error, parallel, chunk,
click.secho(str(count_ko))


@utils.command('extract_from_xml')
@click.argument('pid_file', type=click.File('r'))
@click.argument('xml_file_in', type=click.File('r'))
@click.argument('xml_file_out', type=click.File('wb'))
@click.option('-t', '--tag', 'tag', default='001')
@click.option('-p', '--progress', 'progress', is_flag=True, default=False)
@click.option('-v', '--verbose', 'verbose', is_flag=True, default=False)
def extract_from_xml(pid_file, xml_file_in, xml_file_out, tag, progress,
                     verbose):
    """Extracts xml records with pids.

    Copies every record of ``xml_file_in`` whose controlfield ``tag``
    holds one of the pids listed in ``pid_file`` into ``xml_file_out``,
    wrapped in a MARC21 slim ``<collection>`` element.

    :param pid_file: text file with one pid per line (blank lines are
        ignored)
    :param xml_file_in: xml file to read the records from
    :param xml_file_out: binary file the matching records are written to
    :param tag: ``tag`` attribute of the controlfield holding the pid
    :param progress: display the position and pid of each record read
    :param verbose: display a message for every record found
    """
    click.secho('Extract pids from xml: ', fg='green')
    click.secho('PID file    : {file_name}'.format(file_name=pid_file.name))
    click.secho('XML file in : {file_name}'.format(file_name=xml_file_in.name))
    click.secho('XML file out: {file_name}'.format(
        file_name=xml_file_out.name))

    # Maps every searched pid to the number of records found for it.
    pids = {}
    for line in pid_file:
        pid = line.strip()
        # A blank line is not a pid: registering '' would inflate the
        # count and be reported as a missing pid at the end.
        if pid:
            pids[pid] = 0
    count = len(pids)
    click.secho('Search pids count: {count}'.format(count=count))
    xml_file_out.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    xml_file_out.write(
        b'<collection xmlns="http://www.loc.gov/MARC21/slim">\n\n'
    )
    found = 0
    for idx, xml in enumerate(read_xml_record(xml_file_in)):
        for child in xml:
            is_controlfield = child.tag == 'controlfield'
            is_tag = child.get('tag') == tag
            if is_controlfield and is_tag:
                if progress:
                    # `nl` is a boolean flag for click: the carriage
                    # return has to be part of the message, otherwise a
                    # newline is printed for every record.
                    click.secho(
                        '{idx} {pid}\r'.format(
                            idx=idx,
                            pid=repr(child.text)
                        ),
                        nl=False
                    )
                if child.text in pids:
                    found += 1
                    pids[child.text] += 1
                    data = etree.tostring(
                        xml,
                        pretty_print=True,
                        encoding='UTF-8'
                    ).strip()

                    xml_file_out.write(data)
                    if verbose:
                        click.secho('Found: {pid} on position: {idx}'.format(
                            pid=child.text,
                            idx=idx
                        ))
                # Only the first matching controlfield of a record is used.
                break
    xml_file_out.write(b'\n</collection>')
    if count != found:
        click.secho(
            'Count: {count} Found: {found}'.format(
                count=count,
                found=found
            ),
            fg='red'
        )
        # List the pids for which no record has been found.
        for key, value in pids.items():
            if value == 0:
                click.secho(key)


@utils.command('reserve_pid_range')
@click.option('-t', '--pid_type', 'pid_type', default=None,
help='pid type of the resource')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,6 @@
],
"form": {
"options": [
{
"label": "docsubtype_music",
"value": "docsubtype_music"
},
{
"label": "docsubtype_music",
"value": "docsubtype_music"
Expand All @@ -119,6 +115,10 @@
"label": "docsubtype_sound",
"value": "docsubtype_sound"
},
{
"label": "docsubtype_audio_book",
"value": "docsubtype_audio_book"
},
{
"label": "docsubtype_recorded_words",
"value": "docsubtype_recorded_words"
Expand Down
6 changes: 2 additions & 4 deletions rero_ils/modules/items/api/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,15 +150,13 @@ def _increment_next_prediction_for_holding(
from ...holdings.api import Holding
holding = Holding.get_record_by_pid(item.holding_pid)
if item.get('type') == 'issue' and \
item.get("issue", {}).get("regular") and \
item.get("issue", {}).get("regular") and \
holding.holdings_type == 'serial' and \
holding.get('patterns') and \
holding.get('patterns', {}).get('frequency') != 'rdafr:1016':
updated_holding = holding.increment_next_prediction()
holding = Holding.get_record_by_pid(item.holding_pid)
holding.update(data=updated_holding,
dbcommit=dbcommit, reindex=reindex)
holding = holding.update(data=updated_holding, dbcommit=dbcommit,
reindex=reindex)

@classmethod
def _item_build_org_ref(cls, data):
Expand Down
Loading

0 comments on commit 80fcc81

Please sign in to comment.