Skip to content

Commit

Permalink
fix: ensure always consistent column output of pos_proportions
Browse files Browse the repository at this point in the history
  • Loading branch information
HLasse committed Oct 31, 2023
1 parent 09e6993 commit c12ba20
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions src/textdescriptives/components/pos_proportions.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,24 @@ def pos_proportions(self, text: Union[Doc, Span]) -> dict:
Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with
POSTAG.
"""
pos_counts = Counter()
if self.add_all_tags:
pos_counts: Counter = Counter(self.model_tags) # type: ignore
# add all tags to the counter so they are included in the output
pos_counts.update(self.model_tags)
# reset all counts to 0
pos_counts.subtract(self.model_tags)

else:
pos_counts: Counter = Counter() # type: ignore

if self.use_pos:
pos_counts.update([token.pos_ for token in text])
else:
pos_counts.update([token.tag_ for token in text])

if self.add_all_tags:
# filter out tags that are not in self.model_tags
pos_counts = {
tag: count for tag, count in pos_counts.items() if tag in self.model_tags
}

len_text = len(text)
return {
f"pos_prop_{tag}": count / len(text) if len_text > 0 else np.nan
Expand Down

0 comments on commit c12ba20

Please sign in to comment.