Skip to content

Commit ada8c5f

Browse files
author
Dan Burger
committed
ASB-31862 added docstrings
1 parent 9864066 commit ada8c5f

3 files changed

Lines changed: 872 additions & 180 deletions

File tree

bibcat/core/core_utils.py

Lines changed: 169 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
a paper citation, etc.).
1010
* `cleanse_text`: Cleanse some given text, e.g., excessive whitespace and punctuation.
1111
Can also, e.g., replace citations with an 'Authoretal' placeholder of sorts.
12+
* `is_pos_conjoined`: Check if a conjoined word's original part of speech matches
13+
a given POS tag by traversing the dependency tree to the root of the conjunction chain.
1214
* `is_pos_word`: Check if some given word (of the NLP type) has a particular part of speech.
1315
* `search_text`: Search some given text for mission keywords/acronyms
1416
(e.g., search for "HST").
@@ -28,17 +30,50 @@
2830
logger.setLevel(config.logging.level)
2931

3032

31-
# Determine if given text is important (e.g., is a keyword)
3233
def check_importance(
3334
text, keyword_objs, include_Ipronouns=True, include_terms=True, include_etal=True, version_NLP=None
3435
):
3536
"""
36-
Method: check_importance
37-
WARNING! This method is *not* meant to be used directly by users.
38-
Purpose:
39-
- Checks if given text contains any important terms.
40-
- Important terms includes keywords, 1st,3rd person pronouns, etc.
41-
- Returns dictionary of bools for presence/absence of important terms.
37+
Check if given text contains any important terms.
38+
39+
Evaluates the presence of keywords, acronyms, first- and third-person
40+
pronouns, figure-related terms, and "et al." expressions. Returns a
41+
dictionary of boolean flags for each category.
42+
43+
Parameters
44+
----------
45+
text : str
46+
The input text to evaluate.
47+
keyword_objs : list
48+
Collection of keyword objects used to search for keywords and acronyms.
49+
include_Ipronouns : bool, optional
50+
Whether to check for first-person pronouns. Default is True.
51+
include_terms : bool, optional
52+
Whether to check for third-person pronouns and figure-related terms.
53+
Default is True.
54+
include_etal : bool, optional
55+
Whether to check for "et al." expressions. Default is True.
56+
version_NLP : spacy.tokens.Doc or iterable, optional
57+
Pre-computed NLP representation of ``text``. If None, it is computed
58+
internally. Default is None.
59+
60+
Returns
61+
-------
62+
dict
63+
A dictionary with the following keys:
64+
65+
- ``"bools"`` : dict of bool
66+
Boolean flags for each importance category:
67+
68+
- ``"is_keyword"`` -- text matches a keyword or acronym.
69+
- ``"is_pron_1st"`` -- text contains a first-person pronoun.
70+
- ``"is_pron_3rd"`` -- text contains a third-person pronoun.
71+
- ``"is_term_fig"`` -- text contains a figure-related term.
72+
- ``"is_etal"`` -- text contains an "et al." expression.
73+
- ``"is_any"`` -- any of the above flags is True.
74+
75+
- ``"charspans_keyword"`` : list
76+
Character spans of matched keywords within the text.
4277
"""
4378

4479
# Extract the NLP version of this text, if not given
@@ -121,14 +156,29 @@ def check_importance(
121156
return {"bools": dict_results, "charspans_keyword": charspans_keyword}
122157

123158

124-
# Cleanse given (any length) string of extra whitespace, dashes, etc.
125159
def cleanse_text(text, do_streamline_etal):
126160
"""
127-
Method: cleanse_text
128-
WARNING! This method is *not* meant to be used directly by users.
129-
Purpose:
130-
- Cleanse text of extra whitespace, punctuation, etc.
131-
- Replace paper citations (e.g. 'et al.') with uniform placeholder.
161+
Cleanse a string of extra whitespace, punctuation, and citation expressions.
162+
163+
Removes leading punctuation, normalizes whitespace, strips empty brackets
164+
and doubled punctuation, and optionally replaces author citation patterns
165+
(e.g., "Smith et al. (2020)") with a uniform placeholder string.
166+
167+
Parameters
168+
----------
169+
text : str
170+
The input text to cleanse.
171+
do_streamline_etal : bool
172+
If True, detect and replace author citation patterns such as
173+
"Author (year)", "Author & Author (year)", and "Author et al."
174+
with a configured placeholder. Bracketed citations are removed
175+
entirely; unbracketed citations are replaced with the placeholder.
176+
177+
Returns
178+
-------
179+
str
180+
The cleansed text with normalized whitespace, punctuation, and
181+
(optionally) citation expressions replaced or removed.
132182
"""
133183

134184
# Extract global punctuation expressions
@@ -220,13 +270,35 @@ def cleanse_text(text, do_streamline_etal):
220270
return text
221271

222272

223-
# Return boolean for if given word (NLP type word) is of conjoined given part of speech
224273
def is_pos_conjoined(word, pos):
225274
"""
226-
Method: is_pos_conjoined
227-
WARNING! This method is *not* meant to be used directly by users.
228-
Purpose:
229-
- Determine if original part-of-speech (p.o.s.) for given word (if conjoined) matches given p.o.s.
275+
Determine if a conjoined word's original part of speech matches a given POS tag.
276+
277+
Traverses the dependency tree upward from a conjoined word to find the
278+
root of the conjunction chain, then checks whether that root's part of
279+
speech matches ``pos``. Returns False immediately if the word is not
280+
conjoined, has no ancestors, or if ``pos`` is an auxiliary POS tag.
281+
282+
Parameters
283+
----------
284+
word : spacy.tokens.Token
285+
The NLP token to evaluate.
286+
pos : str
287+
The part-of-speech tag to match against (e.g., ``"NOUN"``,
288+
``"VERB"``). Auxiliary POS tags (as defined in
289+
``config.grammar.speech.pos_aux``) always return False.
290+
291+
Returns
292+
-------
293+
bool
294+
True if the root of the conjunction chain has a POS tag matching
295+
``pos``, False otherwise.
296+
297+
Raises
298+
------
299+
ValueError
300+
If ``word`` is conjoined but no non-conjoined ancestor can be found
301+
in the dependency tree.
230302
"""
231303

232304
# Return False if pos is aux (which may not be conjoined)
@@ -256,17 +328,66 @@ def is_pos_conjoined(word, pos):
256328
raise ValueError("Err: No original p.o.s. for conjoined word {0}!\n{1}".format(word, word_ancestors))
257329

258330

259-
# Return boolean for if given word (NLP type word) is of given part of speech
260331
def is_pos_word(word, pos, keyword_objs=None): # noqa: C901
261332
"""
262-
Method: is_pos_word
263-
WARNING! This method is *not* meant to be used directly by users.
264-
Purpose:
265-
- Return if the given word (of NLP type) is of the given part-of-speech.
266-
Note: keyword_objs is only required if pos='USELESS'.
333+
Determine if a spaCy token belongs to a given part-of-speech category.
334+
335+
Evaluates a token against a named POS category using a combination of
336+
spaCy attributes (``dep_``, ``pos_``, ``tag_``), dependency tree
337+
traversal, and grammar configuration rules. Supports a broad set of
338+
custom POS labels beyond standard spaCy tags, including structural roles
339+
such as subjects, objects, and conjunctions.
340+
341+
Parameters
342+
----------
343+
word : spacy.tokens.Token
344+
The NLP token to evaluate.
345+
pos : str
346+
The part-of-speech category to check. Must be one of:
347+
348+
- ``"ROOT"`` -- syntactic root of the sentence.
349+
- ``"VERB"`` -- main verb, including adjectival modifier verbs.
350+
- ``"USELESS"`` -- non-informative word (not a keyword, subject, or negation).
351+
- ``"SUBJECT"`` -- grammatical subject.
352+
- ``"PREPOSITION"`` -- preposition or mishandled auxiliary "to".
353+
- ``"BASE_OBJECT"`` -- direct or prepositional object (noun).
354+
- ``"DIRECT_OBJECT"`` -- object directly following a verb.
355+
- ``"PREPOSITION_OBJECT"`` -- object following a preposition.
356+
- ``"PREPOSITION_SUBJECT"`` -- subject following a preposition.
357+
- ``"MARKER"`` -- subordinating conjunction or subject marker.
358+
- ``"X"`` -- improper or foreign word.
359+
- ``"CONJOINED"`` -- word joined via conjunction or apposition.
360+
- ``"DETERMINANT"`` -- determiner (e.g., "the", "a").
361+
- ``"AUX"`` -- auxiliary verb.
362+
- ``"NOUN"`` -- noun (excluding determiners).
363+
- ``"PRONOUN"`` -- pronoun.
364+
- ``"ADJECTIVE"`` -- adjective or adjectival verb.
365+
- ``"CONJUNCTION"`` -- coordinating conjunction.
366+
- ``"PASSIVE"`` -- passive verb or auxiliary.
367+
- ``"NEGATIVE"`` -- negation word.
368+
- ``"PUNCTUATION"`` -- punctuation mark (non-alphanumeric).
369+
- ``"BRACKET"`` -- bracket character.
370+
- ``"POSSESSIVE"`` -- possessive marker.
371+
- ``"NUMBER"`` -- numeric token.
372+
373+
keyword_objs : list, optional
374+
Collection of keyword objects required when ``pos="USELESS"``.
375+
Ignored for all other POS categories. Default is None.
376+
377+
Returns
378+
-------
379+
bool
380+
True if ``word`` belongs to the specified POS category, False otherwise.
381+
382+
Raises
383+
------
384+
ValueError
385+
If ``pos="USELESS"`` and ``keyword_objs`` is None.
386+
ValueError
387+
If ``pos`` is not one of the recognized category strings listed above.
267388
"""
268389

269-
##Load global variables
390+
# Load global variables
270391
# word_i = word.i # Index
271392
word_dep = word.dep_ # dep label
272393
word_pos = word.pos_ # p.o.s. label
@@ -572,13 +693,32 @@ def is_pos_word(word, pos, keyword_objs=None): # noqa: C901
572693
return check_all
573694

574695

575-
# Search text for given keywords and acronyms and return metric
576696
def search_text(text, keyword_objs):
577697
"""
578-
Method: search_text
579-
WARNING! This method is *not* meant to be used directly by users.
580-
Purpose: Return boolean for whether or not given text contains keywords/acronyms from given keyword objects.
698+
Search text for keywords and acronyms from a collection of keyword objects.
699+
700+
Iterates over each keyword object, calling its ``identify_keyword`` method,
701+
and aggregates the results into a single boolean match flag and a combined
702+
list of character spans for all matches found.
703+
704+
Parameters
705+
----------
706+
text : str
707+
The input text to search.
708+
keyword_objs : list
709+
Collection of keyword objects, each exposing an ``identify_keyword``
710+
method that returns a dict with keys ``"bool"`` and ``"charspans"``.
711+
712+
Returns
713+
-------
714+
dict
715+
A dictionary with the following keys:
716+
717+
- ``"bool"`` : bool -- True if any keyword or acronym was found in ``text``.
718+
- ``"charspans"`` : list of tuple -- Character spans ``(start, end)``
719+
for every match across all keyword objects.
581720
"""
721+
582722
# Check if keywords and/or acronyms present in given text
583723
tmp_res = [item.identify_keyword(text) for item in keyword_objs]
584724
check_keywords = any([item["bool"] for item in tmp_res])

0 commit comments

Comments
 (0)