Open a Jupyter notebook from your Anaconda prompt.

# import the required packages
import requests
from bs4 import BeautifulSoup
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
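If any of these packages are missing, they can be installed from a notebook cell first (a minimal sketch, assuming pip is available in the environment; html5lib is needed for the parser used below):

# install the dependencies (run once, then restart the kernel if needed)
!pip install requests beautifulsoup4 html5lib nltk seaborn matplotlib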
# fetch the dataset from the link
dataset = 'https://www.gutenberg.org/files/1661/1661-h/1661-h.htm'
reading = requests.get(dataset)
html = reading.text

# extract the text using the web-scraping tool
data = BeautifulSoup(html, "html5lib")
data.title
// OUTPUT
<title>The Project Gutenberg Book of The Adventures of Sherlock Holmes, by Arthur Conan Doyle</title>
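data.title returns the whole <title> tag; to keep only the text inside it, BeautifulSoup's string attribute works (a small sketch, which should print the title shown above without the tag):

# get just the text inside the <title> tag
data.title.string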
# [:12] gives the first 12 matches from the returned list
data.find_all('a')[:12]
// OUTPUT
[<a href="https://www.gutenberg.org">www.gutenberg.org</a>,
<a href="https://www.gutenberg.org/ebooks/48320">
[ #48320 ]</a>,
<a href="#chap01">A Scandal in Bohemia</a>,
<a href="#chap02">The Red-Headed League</a>,
<a href="#chap03">A Case of Identity</a>,
<a href="#chap04">The Boscombe Valley Mystery</a>,
<a href="#chap05">The Five Orange Pips</a>,
<a href="#chap06">The Man with the Twisted Lip</a>,
<a href="#chap07">The Adventure of the Blue Carbuncle</a>,
<a href="#chap08">The Adventure of the Speckled Band</a>,
<a href="#chap09">The Adventure of the Engineer’s Thumb</a>,
<a href="#chap10">The Adventure of the Noble Bachelor</a>]
# tokenize the text with a regular expression
# r"\w+" matches runs of one or more alphanumeric characters (and underscores)
text = data.get_text()
token = re.findall(r'\w+', text)
token[:10]
// OUTPUT
['The',
'Project',
'Gutenberg',
'Book',
'of',
'The',
'Adventures',
'of',
'Sherlock',
'Holmes']
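The same tokenization can also be done with NLTK's own regular-expression tokenizer, which avoids calling re directly (an equivalent sketch; either form produces the same token list):

# equivalent tokenization with NLTK's RegexpTokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
token = tokenizer.tokenize(text)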
# lower-case every token
words = []
for word in token:
    words.append(word.lower())
words[:8]
// OUTPUT
['the', 'project', 'gutenberg', 'book', 'of', 'the', 'adventures', 'of']
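The lower-casing loop can also be written as a one-line list comprehension, the more idiomatic Python form:

# one-line equivalent of the lower-casing loop
words = [word.lower() for word in token]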
# download the stop words corpus
nltk.download("stopwords")
// OUTPUT
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\milind\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
True
# load the list of English stop words
sw = nltk.corpus.stopwords.words('english')
sw[:5]
// OUTPUT
['i', 'me', 'my', 'myself', 'we']
# get the list without stop words
words_ne = []
for word in words:
    if word not in sw:
        words_ne.append(word)
words_ne[:5]
// OUTPUT
['project', 'gutenberg', 'book', 'adventures', 'sherlock']
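Membership tests against a list are O(n) per word; converting the stop words to a set makes the filter noticeably faster on long texts (an equivalent sketch):

# faster equivalent: set membership is O(1) per lookup
sw_set = set(sw)
words_ne = [word for word in words if word not in sw_set]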
# plot the 20 most frequent words
sns.set_style('darkgrid')
nlp_words = nltk.FreqDist(words_ne)
nlp_words.plot(20);
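To read the counts behind the plot numerically, FreqDist also exposes most_common (a quick check; the exact counts depend on the text fetched):

# inspect the counts behind the plot
nlp_words.most_common(10)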