|
9 | 9 | a paper citation, etc.). |
10 | 10 | * `cleanse_text`: Cleanse some given text, e.g., excessive whitespace and punctuation. |
11 | 11 | Can also, e.g., replace citations with an 'Authoretal' placeholder of sorts. |
| 12 | +* `is_pos_conjoined`: Check if a conjoined word's original part of speech matches |
| 13 | + a given POS tag by traversing the dependency tree to the root of the conjunction chain. |
12 | 14 | * `is_pos_word`: Check if some given word (of the NLP type) has a particular part of speech. |
13 | 15 | * `search_text`: Search some given text for mission keywords/acronyms |
14 | 16 | (e.g., search for "HST"). |
|
28 | 30 | logger.setLevel(config.logging.level) |
29 | 31 |
|
30 | 32 |
|
31 | | -# Determine if given text is important (e.g., is a keyword) |
32 | 33 | def check_importance( |
33 | 34 | text, keyword_objs, include_Ipronouns=True, include_terms=True, include_etal=True, version_NLP=None |
34 | 35 | ): |
35 | 36 | """ |
36 | | - Method: check_importance |
37 | | - WARNING! This method is *not* meant to be used directly by users. |
38 | | - Purpose: |
39 | | - - Checks if given text contains any important terms. |
40 | | - - Important terms includes keywords, 1st,3rd person pronouns, etc. |
41 | | - - Returns dictionary of bools for presence/absence of important terms. |
| 37 | + Check if given text contains any important terms. |
| 38 | +
|
| 39 | + Evaluates the presence of keywords, acronyms, first- and third-person |
| 40 | + pronouns, figure-related terms, and "et al." expressions. Returns a |
| 41 | + dictionary of boolean flags for each category. |
| 42 | +
|
| 43 | + Parameters |
| 44 | + ---------- |
| 45 | + text : str |
| 46 | + The input text to evaluate. |
| 47 | + keyword_objs : list |
| 48 | + Collection of keyword objects used to search for keywords and acronyms. |
| 49 | + include_Ipronouns : bool, optional |
| 50 | + Whether to check for first-person pronouns. Default is True. |
| 51 | + include_terms : bool, optional |
| 52 | + Whether to check for third-person pronouns and figure-related terms. |
| 53 | + Default is True. |
| 54 | + include_etal : bool, optional |
| 55 | + Whether to check for "et al." expressions. Default is True. |
| 56 | + version_NLP : spacy.tokens.Doc or iterable, optional |
| 57 | + Pre-computed NLP representation of ``text``. If None, it is computed |
| 58 | + internally. Default is None. |
| 59 | +
|
| 60 | + Returns |
| 61 | + ------- |
| 62 | + dict |
| 63 | + A dictionary with the following keys: |
| 64 | +
|
| 65 | + - ``"bools"`` : dict of bool |
| 66 | + Boolean flags for each importance category: |
| 67 | +
|
| 68 | + - ``"is_keyword"`` -- text matches a keyword or acronym. |
| 69 | + - ``"is_pron_1st"`` -- text contains a first-person pronoun. |
| 70 | + - ``"is_pron_3rd"`` -- text contains a third-person pronoun. |
| 71 | + - ``"is_term_fig"`` -- text contains a figure-related term. |
| 72 | + - ``"is_etal"`` -- text contains an "et al." expression. |
| 73 | + - ``"is_any"`` -- any of the above flags is True. |
| 74 | +
|
| 75 | + - ``"charspans_keyword"`` : list |
| 76 | + Character spans of matched keywords within the text. |
42 | 77 | """ |
43 | 78 |
|
44 | 79 | # Extract the NLP version of this text, if not given |
@@ -121,14 +156,29 @@ def check_importance( |
121 | 156 | return {"bools": dict_results, "charspans_keyword": charspans_keyword} |
122 | 157 |
|
123 | 158 |
|
124 | | -# Cleanse given (any length) string of extra whitespace, dashes, etc. |
125 | 159 | def cleanse_text(text, do_streamline_etal): |
126 | 160 | """ |
127 | | - Method: cleanse_text |
128 | | - WARNING! This method is *not* meant to be used directly by users. |
129 | | - Purpose: |
130 | | - - Cleanse text of extra whitespace, punctuation, etc. |
131 | | - - Replace paper citations (e.g. 'et al.') with uniform placeholder. |
| 161 | + Cleanse a string of extra whitespace, punctuation, and citation expressions. |
| 162 | +
|
| 163 | + Removes leading punctuation, normalizes whitespace, strips empty brackets |
| 164 | + and doubled punctuation, and optionally replaces author citation patterns |
| 165 | + (e.g., "Smith et al. (2020)") with a uniform placeholder string. |
| 166 | +
|
| 167 | + Parameters |
| 168 | + ---------- |
| 169 | + text : str |
| 170 | + The input text to cleanse. |
| 171 | + do_streamline_etal : bool |
| 172 | + If True, detect and replace author citation patterns such as |
| 173 | + "Author (year)", "Author & Author (year)", and "Author et al." |
| 174 | + with a configured placeholder. Bracketed citations are removed |
| 175 | + entirely; unbracketed citations are replaced with the placeholder. |
| 176 | +
|
| 177 | + Returns |
| 178 | + ------- |
| 179 | + str |
| 180 | + The cleansed text with normalized whitespace, punctuation, and |
| 181 | + (optionally) citation expressions replaced or removed. |
132 | 182 | """ |
133 | 183 |
|
134 | 184 | # Extract global punctuation expressions |
@@ -220,13 +270,35 @@ def cleanse_text(text, do_streamline_etal): |
220 | 270 | return text |
221 | 271 |
|
222 | 272 |
|
223 | | -# Return boolean for if given word (NLP type word) is of conjoined given part of speech |
224 | 273 | def is_pos_conjoined(word, pos): |
225 | 274 | """ |
226 | | - Method: is_pos_conjoined |
227 | | - WARNING! This method is *not* meant to be used directly by users. |
228 | | - Purpose: |
229 | | - - Determine if original part-of-speech (p.o.s.) for given word (if conjoined) matches given p.o.s. |
| 275 | + Determine if a conjoined word's original part of speech matches a given POS tag. |
| 276 | +
|
| 277 | + Traverses the dependency tree upward from a conjoined word to find the |
| 278 | + root of the conjunction chain, then checks whether that root's part of |
| 279 | + speech matches ``pos``. Returns False immediately if the word is not |
| 280 | + conjoined, has no ancestors, or if ``pos`` is an auxiliary POS tag. |
| 281 | +
|
| 282 | + Parameters |
| 283 | + ---------- |
| 284 | + word : spacy.tokens.Token |
| 285 | + The NLP token to evaluate. |
| 286 | + pos : str |
| 287 | + The part-of-speech tag to match against (e.g., ``"NOUN"``, |
| 288 | + ``"VERB"``). Auxiliary POS tags (as defined in |
| 289 | + ``config.grammar.speech.pos_aux``) always return False. |
| 290 | +
|
| 291 | + Returns |
| 292 | + ------- |
| 293 | + bool |
| 294 | + True if the root of the conjunction chain has a POS tag matching |
| 295 | + ``pos``, False otherwise. |
| 296 | +
|
| 297 | + Raises |
| 298 | + ------ |
| 299 | + ValueError |
| 300 | + If ``word`` is conjoined but no non-conjoined ancestor can be found |
| 301 | + in the dependency tree. |
230 | 302 | """ |
231 | 303 |
|
232 | 304 | # Return False if pos is aux (which may not be conjoined) |
@@ -256,17 +328,66 @@ def is_pos_conjoined(word, pos): |
256 | 328 | raise ValueError("Err: No original p.o.s. for conjoined word {0}!\n{1}".format(word, word_ancestors)) |
257 | 329 |
|
258 | 330 |
|
259 | | -# Return boolean for if given word (NLP type word) is of given part of speech |
260 | 331 | def is_pos_word(word, pos, keyword_objs=None): # noqa: C901 |
261 | 332 | """ |
262 | | - Method: is_pos_word |
263 | | - WARNING! This method is *not* meant to be used directly by users. |
264 | | - Purpose: |
265 | | - - Return if the given word (of NLP type) is of the given part-of-speech. |
266 | | - Note: keyword_objs is only required if pos='USELESS'. |
| 333 | + Determine if a spaCy token belongs to a given part-of-speech category. |
| 334 | +
|
| 335 | + Evaluates a token against a named POS category using a combination of |
| 336 | + spaCy attributes (``dep_``, ``pos_``, ``tag_``), dependency tree |
| 337 | + traversal, and grammar configuration rules. Supports a broad set of |
| 338 | + custom POS labels beyond standard spaCy tags, including structural roles |
| 339 | + such as subjects, objects, and conjunctions. |
| 340 | +
|
| 341 | + Parameters |
| 342 | + ---------- |
| 343 | + word : spacy.tokens.Token |
| 344 | + The NLP token to evaluate. |
| 345 | + pos : str |
| 346 | + The part-of-speech category to check. Must be one of: |
| 347 | +
|
| 348 | + - ``"ROOT"`` -- syntactic root of the sentence. |
| 349 | + - ``"VERB"`` -- main verb, including adjectival modifier verbs. |
| 350 | + - ``"USELESS"`` -- non-informative word (not a keyword, subject, or negation). |
| 351 | + - ``"SUBJECT"`` -- grammatical subject. |
| 352 | + - ``"PREPOSITION"`` -- preposition or mishandled auxiliary "to". |
| 353 | + - ``"BASE_OBJECT"`` -- direct or prepositional object (noun). |
| 354 | + - ``"DIRECT_OBJECT"`` -- object directly following a verb. |
| 355 | + - ``"PREPOSITION_OBJECT"`` -- object following a preposition. |
| 356 | + - ``"PREPOSITION_SUBJECT"`` -- subject following a preposition. |
| 357 | + - ``"MARKER"`` -- subordinating conjunction or subject marker. |
| 358 | + - ``"X"`` -- improper or foreign word. |
| 359 | + - ``"CONJOINED"`` -- word joined via conjunction or apposition. |
| 360 | + - ``"DETERMINANT"`` -- determiner (e.g., "the", "a"). |
| 361 | + - ``"AUX"`` -- auxiliary verb. |
| 362 | + - ``"NOUN"`` -- noun (excluding determiners). |
| 363 | + - ``"PRONOUN"`` -- pronoun. |
| 364 | + - ``"ADJECTIVE"`` -- adjective or adjectival verb. |
| 365 | + - ``"CONJUNCTION"`` -- coordinating conjunction. |
| 366 | + - ``"PASSIVE"`` -- passive verb or auxiliary. |
| 367 | + - ``"NEGATIVE"`` -- negation word. |
| 368 | + - ``"PUNCTUATION"`` -- punctuation mark (non-alphanumeric). |
| 369 | + - ``"BRACKET"`` -- bracket character. |
| 370 | + - ``"POSSESSIVE"`` -- possessive marker. |
| 371 | + - ``"NUMBER"`` -- numeric token. |
| 372 | +
|
| 373 | + keyword_objs : list, optional |
| 374 | + Collection of keyword objects required when ``pos="USELESS"``. |
| 375 | + Ignored for all other POS categories. Default is None. |
| 376 | +
|
| 377 | + Returns |
| 378 | + ------- |
| 379 | + bool |
| 380 | + True if ``word`` belongs to the specified POS category, False otherwise. |
| 381 | +
|
| 382 | + Raises |
| 383 | + ------ |
| 384 | + ValueError
| 385 | + If ``pos="USELESS"`` and ``keyword_objs`` is None, or if
| 386 | + ``pos`` is not one of the recognized category strings
| 387 | + listed above.
267 | 388 | """ |
268 | 389 |
|
269 | | - ##Load global variables |
| 390 | + # Load global variables |
270 | 391 | # word_i = word.i # Index |
271 | 392 | word_dep = word.dep_ # dep label |
272 | 393 | word_pos = word.pos_ # p.o.s. label |
@@ -572,13 +693,32 @@ def is_pos_word(word, pos, keyword_objs=None): # noqa: C901 |
572 | 693 | return check_all |
573 | 694 |
|
574 | 695 |
|
575 | | -# Search text for given keywords and acronyms and return metric |
576 | 696 | def search_text(text, keyword_objs): |
577 | 697 | """ |
578 | | - Method: search_text |
579 | | - WARNING! This method is *not* meant to be used directly by users. |
580 | | - Purpose: Return boolean for whether or not given text contains keywords/acronyms from given keyword objects. |
| 698 | + Search text for keywords and acronyms from a collection of keyword objects. |
| 699 | +
|
| 700 | + Iterates over each keyword object, calling its ``identify_keyword`` method, |
| 701 | + and aggregates the results into a single boolean match flag and a combined |
| 702 | + list of character spans for all matches found. |
| 703 | +
|
| 704 | + Parameters |
| 705 | + ---------- |
| 706 | + text : str |
| 707 | + The input text to search. |
| 708 | + keyword_objs : list |
| 709 | + Collection of keyword objects, each exposing an ``identify_keyword`` |
| 710 | + method that returns a dict with keys ``"bool"`` and ``"charspans"``. |
| 711 | +
|
| 712 | + Returns |
| 713 | + ------- |
| 714 | + dict |
| 715 | + A dictionary with the following keys: |
| 716 | +
|
| 717 | + - ``"bool"`` : bool -- True if any keyword or acronym was found in ``text``. |
| 718 | + - ``"charspans"`` : list of tuple -- Character spans ``(start, end)`` |
| 719 | + for every match across all keyword objects. |
581 | 720 | """ |
| 721 | + |
582 | 722 | # Check if keywords and/or acronyms present in given text |
583 | 723 | tmp_res = [item.identify_keyword(text) for item in keyword_objs] |
584 | 724 | check_keywords = any([item["bool"] for item in tmp_res]) |
|
0 commit comments