import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
# Load the spaCy pipeline used for every parse in this notebook.
nlp = spacy.load("en_core_web_sm")

sns.set_style("whitegrid")

# Widen the notebook's main container to 90% of the browser window.
# `display` and `HTML` come from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
# Read the quotes CSV from a pinned gist revision, lower-case the column
# names, and keep the row label as an explicit document id.
data_loc_gh = "https://gist.githubusercontent.com/JakubPetriska/060958fd744ca34f099e947cd080b540/raw/963b5a9355f04741239407320ac973a6096cd7b6/quotes.csv"
df = pd.read_csv(data_loc_gh).rename(columns=str.lower)
df = df.assign(doc_id=df.index)
df.head()
author quote doc_id
0 Thomas Edison Genius is one percent inspiration and ninety-n... 0
1 Yogi Berra You can observe a lot just by watching. 1
2 Abraham Lincoln A house divided against itself cannot stand. 2
3 Johann Wolfgang von Goethe Difficulties increase the nearer we get to the... 3
4 Byron Pulsifer Fate is in your hands and no one elses 4
# Size of the quotes table as (rows, columns).
df.shape
(1664, 3)
# Parse every quote through the pipeline, then parse quote 819 on its own
# so it can be inspected in the cells that follow.
docs = [*nlp.pipe(df["quote"])]
doc = nlp(df.loc[819, "quote"])
print(doc)
Yesterday is history. Tomorrow is a mystery. And today? Today is a gift. That is why we call it the present.
# Render `doc` with its named entities highlighted.
spacy.displacy.render(doc, style="ent")
Yesterday DATE is history. Tomorrow DATE is a mystery. And today DATE ? Today DATE is a gift. That is why we call it the present.
# Quote 2 has no named entities, so displaCy emits warning W006 and renders
# the plain text. The doc is selected explicitly so this cell reproduces its
# recorded output when the notebook is run top to bottom.
spacy.displacy.render(docs[2], style="ent")
/Users/conormcdonald/opt/anaconda3/lib/python3.7/site-packages/spacy/displacy/__init__.py:189: UserWarning: [W006] No entities to visualize found in Doc object. If this is surprising to you, make sure the Doc was processed using a model that supports named entity recognition, and check the `doc.ents` property manually if necessary.
  warnings.warn(Warnings.W006)
A house divided against itself cannot stand.
# Render the dependency parse of `doc`.
spacy.displacy.render(doc, style="dep")
Yesterday NOUN is AUX history. NOUN Tomorrow NOUN is AUX a DET mystery. NOUN And CCONJ today? NOUN Today NOUN is AUX a DET gift. NOUN That DET is AUX why ADV we PRON call VERB it PRON the DET present. NOUN npadvmod attr nsubj det attr cc nsubj det attr nsubj advmod nsubj ccomp dobj det oprd
# Re-parse quote 819, show its entities, then list its noun chunks.
doc = nlp(df.loc[819, "quote"])
spacy.displacy.render(doc, style="ent")
doc_nouns = [*doc.noun_chunks]
print(doc_nouns)
Yesterday DATE is history. Tomorrow DATE is a mystery. And today DATE ? Today DATE is a gift. That is why we call it the present.
[history, Tomorrow, a mystery, And today, Today, a gift, we, it, the present]
# Pair every entity span in `doc` with its label.
[(ent, ent.label_) for ent in doc.ents]
[(Yesterday, 'DATE'), (Tomorrow, 'DATE'), (today, 'DATE'), (Today, 'DATE')]
# Token-level view of the first quote: each token with its entity type and
# stop-word flag. The doc is selected explicitly (rather than relying on
# whatever `doc` was last bound to) so the cell reproduces its recorded output.
[(tok, tok.ent_type_, tok.is_stop) for tok in docs[0]]
[(Genius, '', False),
 (is, '', True),
 (one, 'PERCENT', True),
 (percent, 'PERCENT', False),
 (inspiration, '', False),
 (and, '', True),
 (ninety, 'PERCENT', False),
 (-, 'PERCENT', False),
 (nine, 'PERCENT', True),
 (percent, 'PERCENT', False),
 (perspiration, '', False),
 (., '', False)]
# Confirm the container type holding the parsed docs.
type(docs)
list
def extract_tokens_plus_meta(doc:spacy.tokens.doc.Doc):
    """Return one tuple of token attributes for every token in `doc`.

    Each tuple holds, in order: `text`, `i`, `lemma_`, `ent_type_`, `tag_`,
    `dep_`, `pos_`, `is_stop`, `is_alpha`, `is_digit` and `is_punct`.
    """
    attribute_names = (
        "text", "i", "lemma_", "ent_type_", "tag_",
        "dep_", "pos_", "is_stop", "is_alpha",
        "is_digit", "is_punct",
    )
    rows = []
    for token in doc:
        rows.append(tuple(getattr(token, name) for name in attribute_names))
    return rows

def tidy_tokens(docs):
    """Flatten spaCy docs into a single one-row-per-token DataFrame.

    Parameters
    ----------
    docs : iterable of spaCy docs
        The position of each doc in the iterable becomes its `doc_id`.

    Returns
    -------
    pandas.DataFrame
        Columns `doc_id`, `token`, `token_order`, `lemma`, `ent_type`,
        `tag`, `dep`, `pos`, `is_stop`, `is_alpha`, `is_digit`, `is_punct`.
        Row labels restart at 0 for every doc. When no doc contributes a
        token the frame is empty but still carries all columns.
    """
    cols = [
        "doc_id", "token", "token_order", "lemma",
        "ent_type", "tag", "dep", "pos", "is_stop",
        "is_alpha", "is_digit", "is_punct"
    ]
    token_cols = cols[1:]

    frames = []
    for ix, doc in enumerate(docs):
        rows = extract_tokens_plus_meta(doc)
        if not rows:
            # A zero-token doc (e.g. an empty quote) contributes no rows;
            # skipping it still consumes its doc_id so ids stay aligned
            # with the input positions.
            continue
        # Naming the columns at construction avoids a separate relabelling
        # step that fails when the frame has no columns.
        meta = pd.DataFrame(rows, columns=token_cols)
        frames.append(meta.assign(doc_id=ix).loc[:, cols])

    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=cols)
    return pd.concat(frames)
# Build the token-level table for all parsed quotes and preview the first
# 11 rows.
tidy_docs = tidy_tokens(docs)
tidy_docs.head(11)
doc_id token token_order lemma ent_type tag dep pos is_stop is_alpha is_digit is_punct
0 0 Genius 0 genius NN nsubj NOUN False True False False
1 0 is 1 be VBZ ROOT AUX True True False False
2 0 one 2 one PERCENT CD nummod NUM True True False False
3 0 percent 3 percent PERCENT NN compound NOUN False True False False
4 0 inspiration 4 inspiration NN attr NOUN False True False False
5 0 and 5 and CC cc CCONJ True True False False
6 0 ninety 6 ninety PERCENT CD compound NUM False True False False
7 0 - 7 - PERCENT HYPH punct PUNCT False False False True
8 0 nine 8 nine PERCENT CD nummod NUM True True False False
9 0 percent 9 percent PERCENT NN compound NOUN False True False False
10 0 perspiration 10 perspiration NN conj NOUN False True False False
# Histogram of the number of tokens per quote.
tokens_per_doc = tidy_docs.groupby("doc_id").size()
tokens_per_doc.hist(bins=50, figsize=(14, 7), color="red", alpha=.4);
# Count tokens per entity type, ignoring tokens that carry no entity label.
entity_tokens = tidy_docs.query("ent_type != ''")
entity_tokens["ent_type"].value_counts()
DATE           111
CARDINAL        85
ORDINAL         37
ORG             29
TIME            22
PERSON          15
PERCENT         14
QUANTITY         9
NORP             5
GPE              5
FAC              5
WORK_OF_ART      2
LOC              1
PRODUCT          1
Name: ent_type, dtype: int64
# Horizontal bar chart of the ten most frequent lemmas once stop words and
# punctuation are removed.
content_tokens = tidy_docs.query("is_stop == False & is_punct == False")
top_lemmas = content_tokens["lemma"].value_counts().head(10)
top_lemmas.plot(kind="barh", figsize=(24, 14), alpha=.7)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20);
# Show every possessive marker (tag "POS") with the token on either side.
# Neighbours come from a grouped shift, so they never cross a document
# boundary. This replaces a per-group `apply`, which runs Python code for
# every quote and operates on the grouping column (deprecated in newer
# pandas). Rows in `tidy_docs` are already ordered by doc_id, so resetting
# the index first yields the same running row labels as before.
(tidy_docs
 .reset_index(drop=True)
 .assign(
     prev_token=lambda d: d.groupby("doc_id")["token"].shift(1),
     next_token=lambda d: d.groupby("doc_id")["token"].shift(-1))
 .query("tag == 'POS'")
 .loc[:, ["doc_id", "prev_token", "token", "next_token"]]
)
doc_id prev_token token next_token
156 16 else 's paper
1147 124 one 's courage
1470 156 someone 's prayers
2357 242 today 's games
2780 277 world 's great
2789 277 world 's true
9037 593 life 's failures
10880 674 man 's fault
11218 689 Life 's challenges
11345 695 life 's energy
13101 813 Life 's challenges
13862 849 one 's inner
17525 1040 one 's own
17836 1063 today 's disappointments
17843 1063 tomorrow 's dreams
24725 1449 else 's plan
25614 1499 people 's hearts
25656 1501 man 's dreams
26455 1538 another 's path