Natural Language Processing: Tokenize and Plot
Words matter.
Import Packages
#Import Packages
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import nltk
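If this is your first time using NLTK's tokenizers, the Punkt models may not be installed yet; a one-time download (the same call commented out in the Code Summary below) takes care of that.
#One-time download of the Punkt tokenizer models used by word_tokenize()
nltk.download('punkt')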
Generate a “Bag-Of-Words”
#Extracts the contents of a file and returns a bag of words (a count of each word)
def count_words(document):
    #Read the target file's contents into 'raw_file'
    with open(document) as target:
        raw_file = target.read()
    #Tokenize all words in 'raw_file' using word_tokenize()
    token_list = nltk.tokenize.word_tokenize(raw_file)
    #Lowercase each token, keeping only purely alphabetic tokens
    lowercase_tokens = [each.lower() for each in token_list if each.isalpha()]
    #Count the occurrences of each word
    bag_of_words = Counter(lowercase_tokens)
    return bag_of_words
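As a quick sanity check, here is a minimal sketch of calling count_words; 'sample.txt' is a hypothetical file path standing in for your own document.
#Hypothetical example - 'sample.txt' stands in for your own file
bag = count_words("sample.txt")
#Counter.most_common(n) returns the n highest-count (word, count) pairs
print(bag.most_common(5))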
Check our Bag Of Words
Series
word_series = pd.Series(count_words(your_doc))
Describe
print(word_series.describe())
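describe() gives summary statistics (count, mean, max, and so on) over the word counts. If you'd rather see which words dominate, a sketch like the following sorts the Series directly; the exact output depends on your file.
#Sort counts in descending order and show the ten most frequent words
print(word_series.sort_values(ascending=False).head(10))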
Plotting Counts
#Given a bag of words and a desired count, builds a dataframe,
#sets its index, and plots the top counts
def plot_counts(bag_of_words, amount):
    #Convert the top 'amount' entries of the bag of words to a dataframe,
    #e.g. '10' for the ten most frequent words
    df = pd.DataFrame(bag_of_words.most_common(amount))
    #Set the index to column 0, which holds the word for each entry;
    #column 1 holds the count
    df.set_index(0, drop=True, inplace=True)
    #Define a bar plot
    df.plot(kind="bar", legend=False)
    plt.xlabel("Words")
    plt.ylabel("Frequency")
    plt.suptitle("Top " + str(amount) + " Most Frequent Words")
    #Display the plot
    plt.show()
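Putting the two functions together, a call like this sketch plots the ten most frequent words; as before, 'your_doc' stands in for your own file path.
#Plot the ten most frequent words - 'your_doc' is your file path
plot_counts(count_words(your_doc), 10)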
Code Summary
Below is a class version of the above code for you to play with. Note that you'll need to supply your own file path where I've written 'your_document' in the call to run.count_words(your_document).
#Import Packages
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from collections import Counter
#nltk.download('punkt') - uncomment if a LookupError is raised at runtime
class BagOfWordsTokenizer(object):

    #Extracts the contents of a file and returns a bag of words (a count of each word)
    def count_words(self, document):
        #Read the target file's contents into 'raw_file'
        with open(document) as target:
            raw_file = target.read()
        #Tokenize all words in 'raw_file' using word_tokenize()
        token_list = nltk.tokenize.word_tokenize(raw_file)
        #Lowercase each token, keeping only purely alphabetic tokens
        lowercase_tokens = [each.lower() for each in token_list if each.isalpha()]
        #Count the occurrences of each word
        bag_of_words = Counter(lowercase_tokens)
        return bag_of_words

    #Given a bag of words and a desired count, builds a dataframe,
    #sets its index, and plots the top counts
    def plot_counts(self, bag_of_words, amount):
        #Convert the top 'amount' entries of the bag of words to a dataframe,
        #e.g. '10' for the ten most frequent words
        df = pd.DataFrame(bag_of_words.most_common(amount))
        #Set the index to column 0, which holds the word for each entry;
        #column 1 holds the count
        df.set_index(0, drop=True, inplace=True)
        #Define a bar plot
        df.plot(kind="bar")
        #Display the plot
        plt.show()

if __name__ == "__main__":
    run = BagOfWordsTokenizer()
    word_bag = run.count_words(your_document)
    #input() returns a string, so cast it to an int before plotting
    how_many = int(input("How many of the top words would you like to be displayed? "))
    run.plot_counts(word_bag, how_many)
Written on March 21, 2018