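# NOTE: the three lines below appear to be web-page residue picked up when this
# script was copied; they are commented out so the file parses as R.
# 74
# loading...
# This website collects cookies to deliver better user experience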
library(rvest)

# Download and parse the IMDb search results page for titles from 2018.
url <- "https://www.imdb.com/search/title?year=2018"
imdb <- read_html(url)
head(imdb)

# Extract the text of every node carrying the "genre" class.
genre_data_html <- html_nodes(imdb, ".genre")
genre_data <- html_text(genre_data_html)
head(genre_data)

# Remove the \n in front of the genres (literal match, no regex needed).
genre_data <- gsub("\n", "", genre_data, fixed = TRUE)
# Remove the spaces between genres.
genre_data <- gsub(" ", "", genre_data, fixed = TRUE)
# Keep only the first genre of each comma-separated list.
genre_data <- gsub(",.*", "", genre_data)

# Extract the runtimes the same way the genres are extracted.
# NOTE(review): the ".runtime" selector is assumed by analogy with ".genre";
# confirm it against the page markup.
runtime_data_html <- html_nodes(imdb, ".runtime")
runtime_data <- html_text(runtime_data_html)
# Keep only the digits so the value can be treated as a number
# (presumably the raw text looks like "118 min" -- TODO confirm).
runtime_data <- as.numeric(gsub("[^0-9]", "", runtime_data))

# The rest of the script reads imdb_df$Genre and imdb_df$Runtime, so build the
# data frame here. Fail loudly instead of silently recycling or producing an
# empty frame when the scrape did not return one value per title.
if (length(genre_data) == 0L) {
  stop("No '.genre' nodes were found on ", url, call. = FALSE)
}
if (length(genre_data) != length(runtime_data)) {
  stop(
    "Scraped ", length(genre_data), " genres but ", length(runtime_data),
    " runtimes; cannot align them row by row.",
    call. = FALSE
  )
}
imdb_df <- data.frame(
  Genre = genre_data,
  Runtime = runtime_data,
  stringsAsFactors = FALSE
)
# Plot the number of movies by genre.
library(ggplot2)
# Map x to the Genre column of imdb_df rather than to a free-standing vector,
# so the bars always reflect the rows of the data frame being plotted.
ggplot(imdb_df, aes(x = Genre)) +
  geom_bar(color = "purple", fill = "green", alpha = 0.3) +
  ggtitle("Number of movies by genre") +
  xlab("Genre") + ylab("Number of movies")
# Plot the movies by runtime.
# Base-graphics views: one bar per distinct runtime value, then a histogram.
barplot(table(imdb_df$Runtime))
hist(imdb_df$Runtime)
# ggplot2 histogram of the same column. x is mapped to imdb_df's Runtime
# column (the one the base plots above read) instead of a separate vector.
# bins = 30 is the geom_histogram() default, stated explicitly.
ggplot(imdb_df, aes(x = Runtime)) +
  geom_histogram(bins = 30, color = "purple", fill = "green", alpha = 0.3) +
  ggtitle("Distribution of movie runtimes") +
  xlab("Minutes") + ylab("Number of movies")
# Group movies by genre and compute the mean runtime of each group.
library(dplyr)
genre_cat <- group_by(imdb_df, Genre)
# na.rm = TRUE so one missing runtime does not turn a genre's mean into NA.
genre_runtime <- summarize(genre_cat, Minutes = mean(Runtime, na.rm = TRUE))
# Base plot of mean minutes per genre. Genre is converted to a factor
# explicitly because plotting a character x against a numeric y fails.
plot(
  factor(genre_runtime$Genre), genre_runtime$Minutes,
  xlab = "Genre", ylab = "Minutes"
)
# Cross-tabulation of genre against mean minutes (not used by the code below).
counts <- table(genre_runtime$Genre, genre_runtime$Minutes)
# Bar chart of the precomputed means; stat = "identity" plots Minutes as-is.
ggplot(data = genre_runtime, aes(x = Genre, y = Minutes)) +
  geom_bar(stat = "identity", color = "purple", fill = "green", alpha = 0.3) +
  ggtitle("Mean movie duration by genre")