import re
import json
from typing import Optional

import pandas as pd
import spacy
import scispacy
from spacy_langdetect import LanguageDetector
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker


SCIMODELS = [
    "en_core_sci_lg",
    "en_ner_craft_md",
    "en_ner_jnlpba_md",
    "en_ner_bc5cdr_md",
    "en_ner_bionlp13cg_md"
]

stopwords_path = "stopwords-all.json"
with open(stopwords_path, 'r', encoding='utf-8') as infile:
    STOPWORDS = json.load(infile)


class Document(object):
    """A document with a list of tokenized sentences and other metadata."""
    def __init__(
        self,
        tokenized_sentences: Optional[list]=None,
        entities: Optional[list]=None
    ):
        self.tokenized_sentences = tokenized_sentences
        if tokenized_sentences is None:
            self.tokenized_sentences = []
        self.entities = entities
        if entities is None:
            self.entities = []

    def __repr__(self):
        return "{}({})".format(
            type(self).__name__,
            ', '.join([
                f'tokenized_sentences="{self.tokenized_sentences}"',
                f'entities="{self.entities}"',
            ])
        )

    def __str__(self):
        return f"<{repr(self)}>"


class Entity(object):
    """A named entity, extracted from some text."""
    def __init__(
        self,
        canonical_name: Optional[str]=None,
        token: Optional[str]=None,
        umls_id: Optional[str]=None,
        start: Optional[int]=None,
        end: Optional[int]=None,
    ):
        self.token = token
        self.umls_id = umls_id
        self.canonical_name = canonical_name
        self.start = start
        self.end = end

    def to_dict(self):
        return dict(
            token=f"{self.token}",
            start=self.start,
            end=self.end,
            umls_id=f"{self.umls_id}",
            canonical_name=f"{self.canonical_name}",
        )

    def __repr__(self):
        d = self.to_dict()
        return "{}({})".format(
            type(self).__name__,
            ', '.join([f"{k}={d[k]}" for k in d])
        )

    def __str__(self):
        return f"<{repr(self)}>"
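
# A hypothetical sketch of how these containers relate (values invented for
# illustration): a Document holds one token list and one Entity list per
# sentence, e.g.
#   doc = Document()
#   doc.tokenized_sentences.append(["EGFR", "is", "a", "kinase", "."])
#   doc.entities.append([Entity(token="EGFR", start=0, end=1)])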


def init_nlp(
    model: Optional[str]="en_core_sci_lg",
    seg_sents: Optional[bool]=False
    ) -> tuple:
    """
    Initialize an NLP pipeline.

    Args:
        model (str): the name of an installed spaCy model
        seg_sents (bool): segment texts fed into this model into
            sentences first (default=False, i.e. each text fed to the
            model is treated as a single, already-segmented sentence)

    Returns:
        nlp: spaCy NLP pipeline
        linker: entity linker (also used in the pipeline)
    """
    nlp = spacy.load(model)
    nlp.max_length = 2000000

    # don't use sentence segmentation if it's not needed
    if not seg_sents:
        nlp.add_pipe(_prevent_sbd, before='tagger')

    # detect language to avoid parsing non-English text as if it were English
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # add the abbreviation pipe to the spacy pipeline
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    # linker looks up named entities/concepts in the UMLS graph, normalizes data
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    return nlp, linker
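
# Example usage (a sketch, not from the original code; assumes the
# en_core_sci_lg model and the scispacy UMLS linker data are installed):
#   nlp, linker = init_nlp(model="en_core_sci_lg", seg_sents=True)
#   doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited disease.")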


def extract_abbrevs(doc: spacy.tokens.Doc) -> dict:
    """
    Extract abbreviations from a spaCy doc. Return a dict mapping each
    abbreviation to its long form.
    """
    abbrevs = {}
    for abbrev in doc._.abbreviations:
        # only keep abbreviations with a reasonably long long form, so
        # trivial expansions like "a" and "an" are skipped
        if len(abbrev._.long_form) > 4:
            abbrevs[abbrev.text] = abbrev._.long_form
    return abbrevs
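
# Hypothetical example: for a doc over the SBMA sentence above, the
# AbbreviationDetector pairs "SBMA" with the span "Spinal and bulbar
# muscular atrophy", so extract_abbrevs(doc) would return roughly
# {"SBMA": <Span: Spinal and bulbar muscular atrophy>}.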


def expand_abbrevs(sentence: str, abbrevs: dict) -> list:
    """
    Return a tokenized sentence with its abbreviations expanded.

    Args:
        sentence (str): the sentence as a string
        abbrevs (dict): a dictionary of abbreviation to long form to expand

    Returns:
        sent_expanded (list): the whitespace-tokenized sentence with
            abbreviations expanded
    """
    sent_str_expanded = sentence
    for k in abbrevs:
        sent_str_expanded = sent_str_expanded.replace(k, abbrevs[k].text)
    sent_expanded = sent_str_expanded.split()
    return sent_expanded
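
# Hypothetical example, where long_form is a spaCy Span whose .text is
# "spinal and bulbar muscular atrophy":
#   expand_abbrevs("SBMA is inherited", {"SBMA": long_form})
#   -> ["spinal", "and", "bulbar", "muscular", "atrophy", "is", "inherited"]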


def is_stop(token, lang='en'):
    """
    Check if a token is one of the most common words in the language.
    Stopwords list from: https://github.com/6/stopwords-json
    """
    stop_words = STOPWORDS[lang]
    return token in stop_words or token.lower() in stop_words
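
# Example: assuming "the" is in the English list of stopwords-all.json,
# is_stop("The") returns True, while is_stop("kinase") returns False.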


def run_nlp(texts: list, model: Optional[str]="en_core_sci_lg") -> list:
    """
    Process a list of text documents into Document objects holding the
    tokenized sentences and the entities extracted from each sentence.

    Args:
        texts (list): a list of strings
        model (str): the name of the installed spaCy model to use

    Returns:
        documents (list): a list of Document objects (incl. sents and entities)
    """
    # load nlp in here in case we parallelize this func (e.g. w/ joblib) later
    nlp, linker = init_nlp(model=model, seg_sents=False)

    documents = []

    # use nlp.pipe batching from spacy, because it's faster
    docs = nlp.pipe(texts)
    for doc in docs:

        document = Document()

        for sent in doc.sents:
            tokens = [token.text for token in sent]
            document.tokenized_sentences.append(tokens)

            sent_ents = []
            for ent in sent.ents:
                # make the character offsets sentence-relative before
                # converting them into token indices
                result = char_idx_to_token_idx(
                    sent.text,
                    ent.start_char - sent.start_char,
                    ent.end_char - sent.start_char
                )
                if not result:
                    # entity doesn't exist in the sentence (a mistake of NER)
                    continue

                entity = Entity()
                entity.start, entity.end, entity.token = result

                if (
                    is_stop(entity.token) or
                    not re.search('[a-zA-Z]', str(entity.token))
                ):
                    continue

                if len(ent._.umls_ents) > 0:
                    entity.umls_id = ent._.umls_ents[0][0]
                    name = linker.umls.cui_to_entity[entity.umls_id].canonical_name
                    entity.canonical_name = name

                sent_ents.append(entity)

            document.entities.append(sent_ents)

        documents.append(document)

    return documents
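
# Example usage (a sketch; model and UMLS linker downloads assumed):
#   documents = run_nlp(["EGFR is a receptor tyrosine kinase."])
#   documents[0].tokenized_sentences   # one token list per sentence
#   documents[0].entities[0]           # the Entity objects for sentence 0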


def char_idx_to_token_idx(
    sentence: str,
    char_start: int,
    char_end: int
    ) -> tuple:
    """
    Convert string character indices into token indices, where tokens
    are space-separated words of the string.

    Args:
        sentence (str): the sentence as a string
        char_start (int): the index in the string where the first character
            of the desired word token begins
        char_end (int): the index in the string where the last character
            of the desired word token ends (exclusive)

    Returns:
        token_start (int): the index in the space-tokenized list of words in
            the sentence where the desired tokens begin (inclusive on the left)
        token_end (int): the index in the space-tokenized list of words in
            the sentence where the desired tokens end (exclusive on the right)
        term (str): the matched word tokens themselves

    Returns None if the character span doesn't line up with the sentence.
    """
    token_end = 0
    token_start = 0
    term = ''
    for i, char in enumerate(sentence):
        if char == ' ':
            token_end += 1
        if i == char_start:
            token_start = token_end
        if i >= char_start:
            term += char
        if i == char_end:
            return token_start, token_end, term.strip()
    # handle a span that runs to the end of the sentence, which the loop
    # above would otherwise fall through without returning
    if term and char_end >= len(sentence):
        return token_start, token_end + 1, term.strip()
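
# Worked example: in "foo bar baz", "bar" spans characters 4-7, so
# char_idx_to_token_idx("foo bar baz", 4, 7) returns (1, 2, "bar");
# char_idx_to_token_idx("foo bar baz", 8, 11) returns (2, 3, "baz")
# for a span that runs to the end of the string.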


def _prevent_sbd(doc):
    """
    If you already have one sentence per line in your file
    you may wish to disable sentence segmentation with this function,
    which is added to the nlp pipe before the tagger.
    """
    for token in doc:
        token.is_sent_start = False
    return doc