# Structured natural language processing with Pandas and spaCy (code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
# Load the small English pipeline and configure plotting / notebook display.
nlp = spacy.load("en_core_web_sm")
sns.set_style("whitegrid")
# `IPython.core.display` is a deprecated import path for these names;
# the supported public path is `IPython.display`.
from IPython.display import display, HTML
# Widen the notebook container so wide DataFrames render comfortably.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Quotes dataset: a CSV hosted in a public GitHub gist.
data_loc_gh = (
    "https://gist.githubusercontent.com/JakubPetriska/"
    "060958fd744ca34f099e947cd080b540/raw/"
    "963b5a9355f04741239407320ac973a6096cd7b6/quotes.csv"
)
# Read the CSV and normalise the column names to lowercase.
df = pd.read_csv(data_loc_gh).rename(columns=str.lower)
# Stable per-row document id, used to join token tables back to quotes.
df["doc_id"] = df.index
df.head()
df.shape
# Parse every quote in one batched pass (nlp.pipe is much faster than
# calling nlp() once per row).
docs = list(nlp.pipe(df.quote))

# Inspect a single parsed document.
doc = nlp(df.quote[819])
print(doc)

# Render the named-entity view and the dependency parse once each
# (the original notebook export repeated the same render calls and
# re-parsed the same quote a second time).
spacy.displacy.render(doc, style="ent")
spacy.displacy.render(doc, style="dep")

# Noun chunks, entities, and per-token attributes (notebook-style echoes;
# as bare expressions these only display output in a notebook).
doc_nouns = list(doc.noun_chunks)
print(doc_nouns)
[(i, i.label_) for i in doc.ents]
[(i, i.ent_type_, i.is_stop) for i in doc]
type(docs)
def extract_tokens_plus_meta(doc: "spacy.tokens.Doc") -> list:
    """Extract tokens and their metadata from a single spaCy doc.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        A parsed document. Any iterable of token-like objects exposing
        the attributes below also works. (String annotation avoids
        eagerly evaluating the deep `spacy.tokens.doc.Doc` path at
        definition time.)

    Returns
    -------
    list of tuple
        One 11-tuple per token: (text, index, lemma, entity type,
        fine-grained tag, dependency relation, coarse POS, is_stop,
        is_alpha, is_digit, is_punct).
    """
    return [
        (tok.text, tok.i, tok.lemma_, tok.ent_type_, tok.tag_,
         tok.dep_, tok.pos_, tok.is_stop, tok.is_alpha,
         tok.is_digit, tok.is_punct)
        for tok in doc
    ]
def tidy_tokens(docs) -> "pd.DataFrame":
    """Flatten a list of spaCy docs into one tidy token-level DataFrame.

    Parameters
    ----------
    docs : list of spacy.tokens.Doc
        Parsed documents; output row order follows the input order.

    Returns
    -------
    pd.DataFrame
        One row per token with columns doc_id, token, token_order, lemma,
        ent_type, tag, dep, pos, is_stop, is_alpha, is_digit, is_punct.
        The index is a clean RangeIndex (the previous implementation
        returned duplicated per-doc 0..k indices), and an empty `docs`
        yields an empty frame instead of raising from `pd.concat([])`.
    """
    cols = [
        "doc_id", "token", "token_order", "lemma",
        "ent_type", "tag", "dep", "pos", "is_stop",
        "is_alpha", "is_digit", "is_punct"
    ]
    frames = []
    for doc_id, doc in enumerate(docs):
        meta = pd.DataFrame(extract_tokens_plus_meta(doc), columns=cols[1:])
        frames.append(meta.assign(doc_id=doc_id).loc[:, cols])
    if not frames:
        # pd.concat requires at least one frame; preserve the schema.
        return pd.DataFrame(columns=cols)
    return pd.concat(frames, ignore_index=True)
# Build the tidy token table and explore it.
tidy_docs = tidy_tokens(docs)
tidy_docs.head(11)

# Distribution of tokens per quote.
tidy_docs.groupby("doc_id").size().hist(figsize=(14, 7), color="red", alpha=.4, bins=50);

# Named-entity label frequencies (tokens outside entities have ent_type == "").
tidy_docs.query("ent_type != ''").ent_type.value_counts()

# Ten most frequent content lemmas (stop words and punctuation removed).
tidy_docs.query("is_stop == False & is_punct == False").lemma.value_counts().head(10).plot(kind="barh", figsize=(24, 14), alpha=.7)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20);

# Possessive markers (tag == 'POS') with their neighbouring tokens.
# Grouped shifts are equivalent to the original per-group `.apply` with
# nested lambdas (which shadowed `x`), but avoid rebuilding every
# per-document sub-frame.
(tidy_docs
 .assign(
     prev_token=tidy_docs.groupby("doc_id").token.shift(1),
     next_token=tidy_docs.groupby("doc_id").token.shift(-1))
 .reset_index(drop=True)
 .query("tag == 'POS'")
 .loc[:, ["doc_id", "prev_token", "token", "next_token"]]
)