
Commit 21d8ce1

feat(frauenhofer,-spacey): add initial spacy nlp pipeline with RE
1 parent 0d50a48 commit 21d8ce1

4 files changed

Lines changed: 340 additions & 9 deletions

File tree

immunology_kg/relations/get_frauenhofer.py

Lines changed: 54 additions & 9 deletions
@@ -11,6 +11,7 @@
 import pybel
 import pandas as pd
 
+from spacify import run_nlp, SCIMODELS
 from utils import setup_logger
 
 logger = setup_logger(name="get-frauenhofer")
@@ -39,8 +40,8 @@ def load_frauenhofer_json():
 
 def get_entity_names(node):
     """
-    Recursively walk nodes to get all entity name strings
-    in the graph and their concept.
+    Recursively walk nodes in the json pybel graph to get all
+    entity name strings in the graph and their concept.
     """
     if "concept" in node:
         names = {}
@@ -61,7 +62,7 @@ def get_entity_names(node):
 
 def get_entity_names_from_graph(data):
     """
-    Get all the entity name strings from the graph and their concept.
+    Get all the entity name strings and their concept from the graph-as-json.
     """
     names = {}
     skipped = 0
@@ -76,13 +77,14 @@ def get_entity_names_from_graph(data):
 
 def collect_sentences(data):
     """
-    Read the PyBel graph-as-dict structure of the Frauenhofer dataset
+    Read the PyBel graph-as-json-dict structure of the Frauenhofer dataset
     and extract all the necessary data to construct a list of lists
     consisting of the [sentence, source, relation, target, metadata].
 
-    NOTE: this is distinct from the Frauenhofer pybel csv file,
-    in that the source and target entities are simple names here,
-    without all the extra graph structure annotations.
+    NOTE: this is distinct from using pybel.to_csv in that the
+    source and target entities are simple names here, without
+    all the extra graph structure annotations. We still make sure to
+    keep all of that graph metadata in the 'link' column, though.
     """
     entries = []
     fail = 0
@@ -137,22 +139,65 @@ def collect_sentences(data):
         ))
     return entries
 
-def get_cited_sentences(data):
+def get_cited_sentences(
+    data,
+    csv_output='covid19_frauenhofer_annotations.csv'
+):
+    """
+    Extract Frauenhofer sentences and relations with paper
+    citations into a pandas dataframe.
+    """
     entries = collect_sentences(data)
     df = pd.DataFrame(
         entries,
         columns=['sentence', 'source', 'relation', 'target', 'link', 'pmc_id', 'doi_id']
     )
     cite_df = df.dropna(subset=['pmc_id', 'doi_id'], how="all")
-    cite_df.to_csv('covid19_frauenhofer_annotations.csv')
+    cite_df.to_csv(csv_output)
     return cite_df
 
+def add_spacy_nlp_data(
+    df,
+    csv_output='covid19_frauenhofer_annotations_entities.csv'
+):
+    """
+    Use SpaCy to get additional metadata about sentences in the dataframe,
+    namely, the set of entities found by using each of the scispacy models.
+    """
+    # we build a set for each sentence in the Frauenhofer dataset because
+    # each of the SCIMODELS might end up finding the same entities
+    sentence_entities = [None] * len(df)
+    tokenized_sentences = []
+
+    for j, model in enumerate(SCIMODELS):
+
+        documents = run_nlp(df['sentence'].to_list(), model=model)
+
+        for i, doc in enumerate(documents):
+
+            # we know each of these docs is actually only 1 sentence
+            if j == 1:
+                tokenized_sentences.append(doc.tokenized_sentences[0])
+            ents = doc.entities[0]
+
+            for ent in ents:
+                ent_str = json.dumps(ent.to_dict())
+                if sentence_entities[i] is None:
+                    sentence_entities[i] = set()
+                sentence_entities[i].add(ent_str)
+
+    df['entities'] = sentence_entities
+    df['tokenized_sentences'] = tokenized_sentences
+    df.to_csv(csv_output)
+    return df
+
 def main():
     download_frauenhofer()
     data = load_frauenhofer_json()
     cite_df = get_cited_sentences(data)
     logger.info("Sentence annotations with citations: {} / {} ({:.2f}%)".format(
         len(cite_df), len(cite_df), (len(cite_df) / len(cite_df))*100))
+    cite_df = add_spacy_nlp_data(cite_df)
     logger.info("Sample:")
     logger.info(cite_df.head())
     logger.info(cite_df.tail())
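Downstream note: the 'entities' column written by add_spacy_nlp_data holds, per row, a Python set of JSON strings (one json.dumps(Entity.to_dict()) per entity), so after the round trip through to_csv each cell comes back as the text repr of that set. A minimal reader sketch, assuming the default output filename above; load_entity_annotations is a hypothetical helper, not part of this commit:

import ast
import json

import pandas as pd


def load_entity_annotations(path="covid19_frauenhofer_annotations_entities.csv"):
    """Hypothetical reader for the CSV written by add_spacy_nlp_data."""
    df = pd.read_csv(path, index_col=0)

    def decode(cell):
        # empty cells mean no entities were found for that sentence
        if pd.isna(cell):
            return []
        # the cell is the repr of a set of JSON strings; decode both layers
        return [json.loads(ent) for ent in ast.literal_eval(cell)]

    df["entities"] = df["entities"].apply(decode)
    return df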

immunology_kg/relations/spacify.py

Lines changed: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
import re
import json
from typing import Optional

import pandas as pd
import spacy
import scispacy
from spacy_langdetect import LanguageDetector
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker


SCIMODELS = [
    "en_core_sci_lg",
    "en_ner_craft_md",
    "en_ner_jnlpba_md",
    "en_ner_bc5cdr_md",
    "en_ner_bionlp13cg_md"
]

stopwords_path = "stopwords-all.json"
with open(stopwords_path, 'r', encoding='utf-8') as infile:
    STOPWORDS = json.load(infile)

class Document(object):
    """A document with a list of tokenized sentences and other metadata."""
    def __init__(
        self,
        tokenized_sentences: Optional[list]=None,
        entities: Optional[list]=None
    ):
        self.tokenized_sentences = tokenized_sentences
        if tokenized_sentences is None:
            self.tokenized_sentences = []
        self.entities = entities
        if entities is None:
            self.entities = []

    def __repr__(self):
        return "{}({})".format(
            type(self).__name__,
            ', '.join([
                f'tokenized_sentences="{self.tokenized_sentences}"',
                f'entities="{self.entities}"',
            ])
        )

    def __str__(self):
        return f"<{repr(self)}>"

class Entity(object):
    """A named entity, extracted from some text."""
    def __init__(
        self,
        canonical_name: Optional[str]=None,
        token: Optional[str]=None,
        umls_id: Optional[str]=None,
        start: Optional[int]=None,
        end: Optional[int]=None,
    ):
        self.token = token
        self.umls_id = umls_id
        self.canonical_name = canonical_name
        self.start = start
        self.end = end

    def to_dict(self):
        return dict(
            token=f"{self.token}",
            start=self.start,
            end=self.end,
            umls_id=f"{self.umls_id}",
            canonical_name=f"{self.canonical_name}",
        )

    def __repr__(self):
        d = self.to_dict()
        return "{}({})".format(
            type(self).__name__,
            ', '.join([f"{k}={d[k]}" for k in d])
        )

    def __str__(self):
        return f"<{repr(self)}>"

def init_nlp(
    model: Optional[str]="en_core_sci_lg",
    seg_sents: Optional[bool]=False
) -> tuple:
    """
    Initialize an nlp pipeline.

    Args:
        model (str): the name of an installed model from SpaCy
        seg_sents (bool): segment texts fed into this model into
            sentences first (default=False, i.e. the texts fed to the
            model will already be a list of sentences)

    Returns:
        nlp: SpaCy NLP pipeline
        linker: entity linker (also used in the pipeline)
    """
    nlp = spacy.load(model)
    nlp.max_length = 2000000

    # don't use sentence segmentation if it's not needed
    if not seg_sents:
        nlp.add_pipe(_prevent_sbd, before='tagger')

    # detect language to avoid parsing non-English text as if it were English
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # add the abbreviation pipe to the spacy pipeline
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    # the linker looks up named entities/concepts in the UMLS graph and normalizes the data
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    return nlp, linker

def extract_abbrevs(doc: spacy.tokens.Doc) -> dict:
    """
    Extract abbreviations from a SpaCy doc. Return a dict of abbrev to long form.
    """
    abbrevs = {}
    if len(doc._.abbreviations) > 0:
        for abbrev in doc._.abbreviations:
            # require a minimum length so "a" and "an" don't get un-abbreviated
            if len(abbrev._.long_form) > 4:
                abbrev_str = str(doc.text[abbrev.start_char:abbrev.end_char])
                abbrevs[abbrev_str] = abbrev._.long_form
    return abbrevs

def expand_abbrevs(sentence: str, abbrevs: dict) -> list:
    """
    Return a tokenized sentence with its abbreviations expanded.

    Args:
        sentence (str): the sentence as a string
        abbrevs (dict): a dictionary of abbreviation to long form to expand

    Returns:
        sent_expanded (list): the whitespace-tokenized sentence with
            abbreviations expanded
    """
    sent_str_expanded = sentence
    for k in abbrevs:
        sent_str_expanded = sent_str_expanded.replace(k, abbrevs[k].text)
    sent_expanded = sent_str_expanded.split()
    return sent_expanded

def is_stop(token, lang='en'):
    """
    Check if a token is one of the most common words in the language.
    Stopwords list from: https://github.com/6/stopwords-json
    """
    stop_words = STOPWORDS[lang]
    if token in stop_words or token.lower() in stop_words:
        return True
    return False

def run_nlp(texts: list, model: Optional[str]="en_core_sci_lg") -> list:
    """
    Process the list of texts into documents of tokenized sentences and
    entities for each sentence.

    Args:
        texts (list): a list of strings
        model (str): the name of the installed SpaCy model to use

    Returns:
        documents (list): a list of Document objects (incl sents and entities)
    """
    # load nlp in here in case we parallelize this func (e.g. w/ joblib) later?
    nlp, linker = init_nlp(model=model, seg_sents=False)

    documents = []

    # use nlp.pipe parallelization from spacy, because it's faster
    docs = nlp.pipe(texts)
    for i, doc in enumerate(docs):

        document = Document()

        for sent in doc.sents:
            tokens = [token.text for token in sent]
            document.tokenized_sentences.append(tokens)

            sent_ents = []
            for ent in sent.ents:
                result = char_idx_to_token_idx(
                    ' '.join(tokens),
                    ent.start_char,
                    ent.end_char
                )
                if not result:
                    # entity doesn't exist in the sentence (a mistake of NER)
                    continue

                entity = Entity()
                entity.start, entity.end, _ = result
                entity.token = ' '.join(tokens[entity.start:entity.end])

                if (
                    is_stop(entity.token) or
                    not re.search('[a-zA-Z]', str(entity.token))
                ):
                    continue

                if len(ent._.umls_ents) > 0:
                    entity.umls_id = ent._.umls_ents[0][0]
                    name = linker.umls.cui_to_entity[entity.umls_id].canonical_name
                    entity.canonical_name = name

                sent_ents.append(entity)

            document.entities.append(sent_ents)

        documents.append(document)

    return documents

def char_idx_to_token_idx(
    sentence: str,
    char_start: int,
    char_end: int
) -> tuple:
    """
    Convert string character indices into token indices, where tokens
    are space-separated words of the string.

    Args:
        sentence (str): the index in the string is taken over this sentence
        char_start (int): the index in the string where the first character
            of the desired word token begins
        char_end (int): the index in the string where the last character
            of the desired word token ends

    Returns:
        token_start (int): the index in the space-tokenized list of words in
            the sentence where the desired tokens begin (inclusive on the left)
        token_end (int): the index in the space-tokenized list of words in
            the sentence where the desired tokens end (exclusive on the right)
        term (str): the matched word tokens themselves
    """
    token_end = 0
    token_start = 0
    ending = False
    term = ''
    for i, char in enumerate(sentence):
        if char == ' ':
            token_end += 1
        if i == char_start:
            token_start = token_end
        if i >= char_start:
            term += char
        if i == char_end:
            return token_start, token_end, term.strip()

def _prevent_sbd(doc):
    """
    If you already have one sentence per line in your file,
    you may wish to disable sentence segmentation with this function,
    which is added to the nlp pipe before the tagger.
    """
    for token in doc:
        token.is_sent_start = False
    return doc
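
A quick usage sketch of the pipeline above (illustrative only: it assumes the scispacy models named in SCIMODELS and the UMLS linker data are already installed, and the example sentences are made up):

from spacify import run_nlp

sentences = [
    "IL-6 induces STAT3 phosphorylation in T cells.",
    "ACE2 is the entry receptor for SARS-CoV-2.",
]

# each input string is treated as a single, pre-segmented sentence
docs = run_nlp(sentences, model="en_core_sci_lg")

for doc in docs:
    print(doc.tokenized_sentences[0])
    for entity in doc.entities[0]:
        print("  ", entity.token, entity.umls_id, entity.canonical_name)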

immunology_kg/relations/stopwords-all.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
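
The stopwords file itself is not rendered above, but is_stop() in spacify.py indexes it by language code and checks token membership, so the expected shape is a JSON object mapping language codes to word lists, roughly as follows (a sketch based on the stopwords-json project; only a few words shown):

{
  "en": ["the", "of", "and", "to", "a"],
  "de": ["der", "die", "und"]
}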
