Skip to content

Commit

Permalink
Merge pull request #101 from Congress-Dev/master-parser-fix
Browse files Browse the repository at this point in the history
Master parser fix
  • Loading branch information
mustyoshi authored Dec 22, 2024
2 parents 9f5aa52 fa30857 commit 0fe5e61
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 41 deletions.
13 changes: 7 additions & 6 deletions backend/billparser/actions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def determine_action(text: str) -> dict:
actions[action] = gg
break
elif action == "TRANSFER-FUNDS":
print("No match for", action, text)
# print("No match for", action, text)
pass
return actions


Expand Down Expand Up @@ -180,15 +181,15 @@ def set_diff_id(self, diff_id):
self.diff_id = diff_id

def set_action(self, action):
print("set_action")
# print("set_action")
# TODO: We need a way to say the cite is fully parsed already
# print(action)
self.action = action
within = action.get("within", None)
target = action.get("target", None)
print(f"{self.parsed_cite=}")
# print(f"{self.parsed_cite=}")
if self.parsed_cite == "" and (self.last_title != "") and (target is not None):
print("Parse such code")
# print("Parse such code")
if within is None or within.lower() == "such code":
# print("suchcode")
try:
Expand All @@ -204,11 +205,11 @@ def set_action(self, action):
self.parsed_cite = "/".join(
[self.parsed_cite] SubParts.findall(action.get("target", ""))
)
print(f"{self.parsed_cite=}")
# print(f"{self.parsed_cite=}")
target_section = action.get("target_section", None)

if target_section is not None and len(self.parsed_cite.split("/")) < 5:
print("Add target section", self.parsed_cite.split("/"))
# print("Add target section", self.parsed_cite.split("/"))
self.parsed_cite = "/".join(
[self.parsed_cite] SubParts.findall(target_section)
)
Expand Down
13 changes: 10 additions & 3 deletions backend/billparser/db/handler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import re
import string
import time

from lxml import etree
from unidecode import unidecode # GPLV2
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.pool import NullPool

from billparser.utils.citation import resolve_citations
Expand All @@ -22,10 +23,16 @@
engine = create_engine(DATABASE_URI, poolclass=NullPool, connect_args={'sslmode': "disable"})

Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine, query_cls=query_callable(regions))
ribber = string.ascii_letters string.digits

ribber = string.ascii_letters string.digits
Session = scoped_session(sessionmaker(bind=engine, query_cls=query_callable(regions)))

def init_session():
    """Rebuild the module-level ``Session`` factory for the current process.

    Creates a fresh engine with the same settings as the module-level engine
    (``NullPool``, ``sslmode`` disabled) and rebinds the global ``Session`` to
    a new ``scoped_session`` bound to it.

    The session factory keeps the same ``query_cls`` as the module-level
    ``Session``; without it, sessions created after this call would use a
    different query class than sessions created before it.
    """
    global Session
    # Deliberately a local: the module-level `engine` is left untouched.
    process_engine = create_engine(
        DATABASE_URI, poolclass=NullPool, connect_args={"sslmode": "disable"}
    )
    Session = scoped_session(
        sessionmaker(bind=process_engine, query_cls=query_callable(regions))
    )

def unidecode_str(input_str: str) -> str:
    """Transliterate *input_str* with ``unidecode`` and replace ``--`` with ``-``.

    A falsy input (``None`` or ``""``) is treated as the empty string.
    """
    raw_text = input_str if input_str else ""
    transliterated = unidecode(raw_text)
    return transliterated.replace("--", "-")

Expand Down
2 changes: 1 addition & 1 deletion backend/billparser/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def determine_action(xref):
# TODO: Unused function?
par = xref.getparent()
texts = [x for x in par.itertext() if x != xref.text]
print("actions", texts[-1])
# print("actions", texts[-1])
if "adding at the end the following" in texts[-1]:
return "APPEND"
if "is amended—" in texts[-1]:
Expand Down
59 changes: 34 additions & 25 deletions backend/billparser/run_through.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

from billparser.utils.logger import LogContext
from billparser.utils.cite_parser import parse_action_for_cite
from billparser.db.handler import Session
from billparser.db.handler import Session, init_session
from billparser.translater import translate_paragraph

from joblib import Parallel, delayed
Expand Down Expand Up @@ -349,7 +349,7 @@ def recursive_bill_content(
f"Items look like: {search_element.tag} and {len(search_element)}"
)
if content is not None:
print(content.content_type, content.content_str)
# print(content.content_type, content.content_str)
session.add(content)
if True:
root_path = search_element.getroottree().getpath(search_element)
Expand Down Expand Up @@ -413,7 +413,8 @@ def recursive_bill_content(
return res


def check_for_existing_legislation_version(bill_obj: object, session) -> bool:
def check_for_existing_legislation_version(bill_obj: object) -> bool:
session = Session()
# Check to see if we've already ingested this bill
existing_legis = (
session.query(Legislation)
Expand Down Expand Up @@ -464,6 +465,7 @@ def retrieve_existing_legislations(session) -> List[dict]:


def parse_bill(f: str, path: str, bill_obj: object, archive_obj: object) -> LegislationVersion:
init_session()
with LogContext(
{
"bill_number": bill_obj["bill_number"],
Expand All @@ -474,9 +476,10 @@ def parse_bill(f: str, path: str, bill_obj: object, archive_obj: object) -> Legi
new_bill_version = None
start_time = time.time()
res = []
session = Session()
try:
session = Session()
found = check_for_existing_legislation_version(bill_obj, session)

found = check_for_existing_legislation_version(bill_obj)
if found:
logging.info(f"Skipping {archive_obj.get('file')}")
return []
Expand Down Expand Up @@ -549,7 +552,11 @@ def parse_bill(f: str, path: str, bill_obj: object, archive_obj: object) -> Legi
)
except Exception as e:
logging.error("Uncaught exception", exc_info=e)

finally:
try:
session.close()
except:
pass
for r in res:
if "text_element" in r:
del r["text_element"]
Expand Down Expand Up @@ -750,21 +757,24 @@ def parse_archive(
names = []
rec = []
for file in archive.namelist():
parsed = filename_regex.search(file)
house = parsed.group("house")
session = parsed.group("session")
bill_number = int(parsed.group("bill_number"))
bill_version = parsed.group("bill_version")
file_title = f"{session} - {house}{bill_number} - {bill_version}"
names.append(
{
"title": file_title,
"path": file,
"bill_number": bill_number,
"bill_version": bill_version,
"chamber": chamb[house],
}
)
try:
parsed = filename_regex.search(file)
house = parsed.group("house")
session = parsed.group("session")
bill_number = int(parsed.group("bill_number"))
bill_version = parsed.group("bill_version")
file_title = f"{session} - {house}{bill_number} - {bill_version}"
names.append(
{
"title": file_title,
"path": file,
"bill_number": bill_number,
"bill_version": bill_version,
"chamber": chamb[house],
}
)
except:
pass

names = sorted(names, key=lambda x: x["bill_number"])
# names = names[50:55]
Expand Down Expand Up @@ -837,10 +847,10 @@ def parse_archives(
print(file, "bad")
continue
house = parsed.group("house")
session = parsed.group("session")
congress_session = parsed.group("session")
bill_number = int(parsed.group("bill_number"))
bill_version = parsed.group("bill_version")
file_title = f"{session} - {house}{bill_number} - {bill_version}"
file_title = f"{congress_session} - {house}{bill_number} - {bill_version}"
names.append(
{
"title": file_title,
Expand Down Expand Up @@ -874,7 +884,6 @@ def filter_logic(x):

return True

session = Session()
existing_legislation = retrieve_existing_legislations(session)
print("Existing legislation", len(existing_legislation))
legis_lookup: Dict[LegislationChamber, List[LegislationVersionEnum]] = defaultdict(
Expand All @@ -897,7 +906,7 @@ def filter_existing_legislation(x):
names = [x for x in names if filter_logic(x) and filter_existing_legislation(x)]
print("New legislation", len(names))

frec = Parallel(n_jobs=THREADS, backend="multiprocessing", verbose=5)(
frec = Parallel(n_jobs=THREADS, backend="loky", verbose=5)(
delayed(parse_bill)(
open_archives[name["archive_index"]]
.open(name["path"], "r")
Expand Down
12 changes: 6 additions & 6 deletions backend/billparser/utils/cite_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ def parse_action_for_cite(action_object: dict) -> str:
Returns:
str: A USCode citation str
"""
print(action_object)
# print(action_object)
try:
parent_cite = ""
if action_object["text_element"] is not None:
print(
etree.tostring(
action_object["text_element"], pretty_print=True, encoding="unicode"
)
)
# print(
# etree.tostring(
# action_object["text_element"], pretty_print=True, encoding="unicode"
# )
# )
xref = action_object["text_element"].find("external-xref[@legal-doc='usc']")
if xref is not None:
cite = convert_to_usc_id(xref)
Expand Down

0 comments on commit 0fe5e61

Please sign in to comment.