# Installing the Python library to read the PDF
!pip install pypdf2
import PyPDF2
# Path where the book is located
URL_livro = './drive/MyDrive/machado_assis/pixarAvulsos.pdf'
# Opening the file at the indicated location
book = open(URL_livro, 'rb')
#Reading the book
pdfReader = PyPDF2.PdfFileReader(book)
text = '' # variable where all the text of the book will be stored
# The tale is only between pages 3 and 32
for num in range(3, 32):
    page = pdfReader.getPage(num)
    text = text + page.extractText()
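# (Optional check, not in the original walkthrough: print a little of the
# extracted text to confirm that PyPDF2 actually read the pages)
print(len(text))
print(text[:200])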
#Package installation
!pip install nltk
#Installation of word dictionaries (corpus)
!python -m nltk.downloader all
#dividing our text into sentences and then into words
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
sentencas = sent_tokenize(text)
palavras = word_tokenize(text.lower())
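# (Optional, illustrative: count how many sentences and words the tokenizers found)
print(len(sentencas), 'sentences')
print(len(palavras), 'words')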
#Removing the stopwords
from nltk.corpus import stopwords
from string import punctuation
stopwords = set(stopwords.words('portuguese') + list(punctuation))
palavras_sem_stopwords = [palavra for palavra in palavras if palavra not in stopwords]
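# (Optional, illustrative: compare the word count before and after removing stopwords)
print(len(palavras), '->', len(palavras_sem_stopwords))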
#Creating the frequency distribution
from nltk.probability import FreqDist
frequencia = FreqDist(palavras_sem_stopwords)
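# (Optional, illustrative: the ten most frequent words hint at the tale's main themes)
print(frequencia.most_common(10))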
#Separating the most important sentences
from collections import defaultdict
sentencas_importantes = defaultdict(int)
#Loop to go through all the sentences and collect all the statistics
for i, sentenca in enumerate(sentencas):
    for palavra in word_tokenize(sentenca.lower()):
        if palavra in frequencia:
            sentencas_importantes[i] += frequencia[palavra]
#"n" most important sentences
from heapq import nlargest
idx_sentencas_importantes = nlargest(4, sentencas_importantes, key=sentencas_importantes.get)
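# (Optional, illustrative: inspect the score each selected sentence received)
for i in idx_sentencas_importantes:
    print(i, sentencas_importantes[i])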
# We have the summary! :)
resumo = ''
for i in sorted(idx_sentencas_importantes):
    resumo = resumo + sentencas[i] + ' '
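# (Optional: print the summary to read it before converting it to audio)
print(resumo)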
!pip install gTTS
from gtts import gTTS
import os
tts = gTTS(resumo, lang='pt-br')
tts.save('resumo.mp3')
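# (Optional, assuming a Colab/Jupyter notebook: play the generated audio inline)
from IPython.display import Audio
Audio('resumo.mp3')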