# Structured natural language processing with Pandas and spaCy (code)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
# Load the small English pipeline and configure plotting / notebook display.
nlp = spacy.load("en_core_web_sm")
sns.set_style("whitegrid")
# `IPython.core.display` is a deprecated import path for these names;
# the supported public path is `IPython.display`.
from IPython.display import display, HTML
# Widen the notebook container so wide DataFrames render comfortably.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Quotes dataset: a CSV hosted in a public GitHub gist.
data_loc_gh = (
    "https://gist.githubusercontent.com/JakubPetriska/"
    "060958fd744ca34f099e947cd080b540/raw/"
    "963b5a9355f04741239407320ac973a6096cd7b6/quotes.csv"
)
# Read the CSV and normalise the column names to lowercase.
df = pd.read_csv(data_loc_gh).rename(columns=str.lower)
# Stable per-row document id, used to join token tables back to quotes.
df["doc_id"] = df.index
df.head()
df.shape
# Parse every quote in one batched pass (nlp.pipe is much faster than
# calling nlp() once per row).
docs = list(nlp.pipe(df.quote))

# Inspect a single parsed document.
doc = nlp(df.quote[819])
print(doc)

# Render the named-entity view and the dependency parse once each
# (the original notebook export repeated the same render calls and
# re-parsed the same quote a second time).
spacy.displacy.render(doc, style="ent")
spacy.displacy.render(doc, style="dep")

# Noun chunks, entities, and per-token attributes (notebook-style echoes;
# as bare expressions these only display output in a notebook).
doc_nouns = list(doc.noun_chunks)
print(doc_nouns)
[(i, i.label_) for i in doc.ents]
[(i, i.ent_type_, i.is_stop) for i in doc]
type(docs)
def extract_tokens_plus_meta(doc: "spacy.tokens.Doc") -> list:
    """Extract tokens and their metadata from a single spaCy doc.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        A parsed document. Any iterable of token-like objects exposing
        the attributes below also works. (String annotation avoids
        eagerly evaluating the deep `spacy.tokens.doc.Doc` path at
        definition time.)

    Returns
    -------
    list of tuple
        One 11-tuple per token: (text, index, lemma, entity type,
        fine-grained tag, dependency relation, coarse POS, is_stop,
        is_alpha, is_digit, is_punct).
    """
    return [
        (tok.text, tok.i, tok.lemma_, tok.ent_type_, tok.tag_,
         tok.dep_, tok.pos_, tok.is_stop, tok.is_alpha,
         tok.is_digit, tok.is_punct)
        for tok in doc
    ]
def tidy_tokens(docs) -> "pd.DataFrame":
    """Flatten a list of spaCy docs into one tidy token-level DataFrame.

    Parameters
    ----------
    docs : list of spacy.tokens.Doc
        Parsed documents; output row order follows the input order.

    Returns
    -------
    pd.DataFrame
        One row per token with columns doc_id, token, token_order, lemma,
        ent_type, tag, dep, pos, is_stop, is_alpha, is_digit, is_punct.
        The index is a clean RangeIndex (the previous implementation
        returned duplicated per-doc 0..k indices), and an empty `docs`
        yields an empty frame instead of raising from `pd.concat([])`.
    """
    cols = [
        "doc_id", "token", "token_order", "lemma",
        "ent_type", "tag", "dep", "pos", "is_stop",
        "is_alpha", "is_digit", "is_punct"
    ]
    frames = []
    for doc_id, doc in enumerate(docs):
        meta = pd.DataFrame(extract_tokens_plus_meta(doc), columns=cols[1:])
        frames.append(meta.assign(doc_id=doc_id).loc[:, cols])
    if not frames:
        # pd.concat requires at least one frame; preserve the schema.
        return pd.DataFrame(columns=cols)
    return pd.concat(frames, ignore_index=True)
# Build the tidy token table and explore it.
tidy_docs = tidy_tokens(docs)
tidy_docs.head(11)

# Distribution of tokens per quote.
tidy_docs.groupby("doc_id").size().hist(figsize=(14, 7), color="red", alpha=.4, bins=50);

# Named-entity label frequencies (tokens outside entities have ent_type == "").
tidy_docs.query("ent_type != ''").ent_type.value_counts()

# Ten most frequent content lemmas (stop words and punctuation removed).
tidy_docs.query("is_stop == False & is_punct == False").lemma.value_counts().head(10).plot(kind="barh", figsize=(24, 14), alpha=.7)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20);

# Possessive markers (tag == 'POS') with their neighbouring tokens.
# Grouped shifts are equivalent to the original per-group `.apply` with
# nested lambdas (which shadowed `x`), but avoid rebuilding every
# per-document sub-frame.
(tidy_docs
 .assign(
     prev_token=tidy_docs.groupby("doc_id").token.shift(1),
     next_token=tidy_docs.groupby("doc_id").token.shift(-1))
 .reset_index(drop=True)
 .query("tag == 'POS'")
 .loc[:, ["doc_id", "prev_token", "token", "next_token"]]
)