36
loading...
This website collects cookies to deliver better user experience
Web scraping is the process of using bots to extract content and data from a website.
"dependencies": {
"express": "^4.17.1",
"nodemailer": "^6.6.2",
"puppeteer": "^10.1.0"
},
"devDependencies": {
"eslint-config-prettier": "^8.3.0",
"eslint-plugin-prettier": "^3.4.0",
"nodemon": "^2.0.9",
"prettier": "^2.3.2"
}
Prettier
and Nodemon
come in handy for a nicer development experience; they are not mandatory, though, so feel free to use any other tool.
scraper-template/
├── index.js
├── package.json
└── routes/
├── booking.js
└── screenshots/
├── home-page.png
└── services/
├── bookingHandler.js
├── emailSender.js
index.js
is a simple file, roughly 20 lines long:
const express = require('express');
// Application bootstrap: builds the Express app, mounts the booking routes,
// installs a JSON error handler and starts listening.
const app = express();
// `||` (not `??`) is intentional: an empty PORT env var also falls back to 3000.
const port = process.env.PORT || 3000;
const booking = require('./routes/booking');

/* Root endpoint: replies with a static JSON payload */
app.get('/', (req, res) => {
  res.json({ message: 'ok' });
});

app.use('/booking', booking);

/* Error handler middleware */
app.use((err, req, res, next) => {
  // Once the response has started, a new status/body can no longer be written;
  // hand the error to Express' default handler, which closes the connection.
  if (res.headersSent) {
    next(err);
    return;
  }
  const statusCode = err.statusCode || 500;
  console.error(err.message, err.stack);
  res.status(statusCode).json({ message: err.message });
});

// Bind on all interfaces so the app is reachable from outside the host/container.
app.listen(port, '0.0.0.0', () => {
  console.log(`Scrapper app listening at http://localhost:${port}`);
});
routes/booking.js
includes the expressjs
, services
and config
references; let's break it down:
const express = require('express');
const router = express.Router();
...
...
...
...
const emailSender = require('../services/emailSender');
const bookingHandler = require('../services/bookingHandler');
...
...
process.env
vars; these include the keys for login (webSiteUser
, webSitePassword
), email impersonation(authUser
, appPassword
) and email receivers(emailFrom
, emailTo
):...
...
const {
webSiteUser,
webSitePassword,
authUser,
appPassword,
emailFrom,
emailTo,
preferTime,
} = require('../config');
/**
 * GET /book-me
 * Asks the booking handler to book a slot using the configured credentials
 * and preferred time, then replies with a plain-text summary of the outcome.
 * Any failure is logged and forwarded to the Express error middleware.
 */
router.get('/book-me', async (req, res, next) => {
  try {
    const outcome = await bookingHandler.bookMe(webSiteUser, webSitePassword, preferTime);
    const summary = `The result of the booking was::${outcome}`;
    res.send(summary);
  } catch (failure) {
    console.error('Error while booking me for next week', failure.message);
    next(failure);
  }
});
...
...
...
...
/**
 * GET /my-bookings
 * Fetches the current bookings through the booking handler, triggers an email
 * with the result and replies with the same result as HTML.
 * Failures while fetching or responding are logged and forwarded to the
 * Express error middleware.
 */
router.get('/my-bookings', async function (req, res, next) {
  try {
    const bookingResult = await bookingHandler.myBookings(
      webSiteUser,
      webSitePassword
    );
    // The email is deliberately fire-and-forget so the HTTP response is not
    // delayed by it, but the promise must not be left floating: a rejection
    // would otherwise surface as an unhandled rejection.
    emailSender
      .sendEmail(bookingResult, {
        authUser,
        appPassword,
        emailFrom,
        emailTo,
      })
      .catch((emailErr) => {
        console.error(`Error while emailing the bookings for this week`, emailErr);
      });
    // Only an `html` handler is registered, so content negotiation either
    // serves the bookings as HTML or lets Express reject the request.
    res.format({
      html: () => res.send(bookingResult),
    });
  } catch (err) {
    console.error(`Error while getting the booking for this week`, err.message);
    next(err);
  }
});
emailSender
:bookingHandler
:const puppeteer = require('puppeteer');
puppeteer
is ready to roll! There are plenty of examples on the internet; most of them apply all the web scraping concepts in one single file, which is not the case here.
Puppeteer
works perfectly with Chromium and Firefox Nightly; for this project the browser used is the default one, Chrome
(the web site to scrape only opens on Chrome
), but if Firefox
is preferred, take a look at this thread on StackOverflow.
isProduction
, this var is ready for being used when deployed on a web platform(Heroku we'll talk about it later), and another for isDev
, I repeat, this is for explanation purposes; it is not required to have two variables when one of them can be negated to produce the same result.
isProduction
the launch is done headless
by default, it means that the process is done in the background without any UI, also some args
are included for a better performance, refer to the list of Chromium
flags here.isDev
, the headless
is false, and args
also include one for opening the dev tools after loading the browser.
const isProduction = process.env.NODE_ENV === 'production' ? true : false;
// Development mode is simply the negation of production mode.
const isDev = !isProduction;
// Human-readable messages for the two failure categories of the scraper.
// NOTE(review): their use is not visible in this excerpt — confirm where they are raised.
const authenticationError = 'Failed the authentication process';
const bookingError = 'Failed the booking process';
/**
 * Launches a Puppeteer browser and opens one page in it.
 *
 * In production the browser is launched with Puppeteer's default headless
 * setting plus sandbox/shared-memory flags, and the page is opened directly on
 * the browser. Otherwise a visible, slowed-down browser with dev tools is
 * launched and the page is opened inside an incognito context.
 *
 * @returns {Promise<{browser: object, page: object}>} the launched browser and its page.
 * @throws rethrows any error raised while creating the context or the page,
 *   after closing the browser so no orphaned browser process is left behind.
 */
async function startBrowser() {
  const launchOptions = isProduction
    ? {
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
      }
    : {
        headless: false,
        defaultViewport: null,
        slowMo: 75,
        args: [
          '--auto-open-devtools-for-tabs',
          '--disable-web-security',
          '--disable-features=IsolateOrigins,site-per-process',
          // NOTE(review): this entry is passed as ONE argument; if three separate
          // switches were intended it should be split — confirm before changing.
          '--flag-switches-begin --disable-site-isolation-trials --flag-switches-end',
        ],
      };

  const browser = await puppeteer.launch(launchOptions);

  try {
    let page;
    if (isProduction) {
      page = await browser.newPage();
    } else {
      const context = await browser.createIncognitoBrowserContext();
      page = await context.newPage();
    }
    return { browser, page };
  } catch (err) {
    // The caller never receives the browser on failure, so it must be closed
    // here. A failure while closing is ignored on purpose: the setup error is
    // the one worth surfacing.
    await browser.close().catch(() => {});
    throw err;
  }
}
puppeteer
features come into play:
goto
: allows navigation to a web site
type
: types a value into an input field
click
: allows clicking on buttons, table cells, submits
waitForSelector
: recommended for allowing the page to recognize a particular selector before moving along
screenshot
: takes a screenshot on demand and stores it in the app (it is possible to redirect the screenshots to remote services; in dev just place them in a root folder)
/**
 * Logs into the target web site through the given Puppeteer page.
 *
 * Navigates to the login URL, fills in the email/password form, submits it and
 * waits for the `#sidebar` element to appear. In development mode progress
 * messages are logged and a screenshot is saved to `screenshots/home-page.png`.
 *
 * @param {object} page - Puppeteer page used for the whole login flow.
 * @param {string} webSiteUser - value typed into `#loginform-email`.
 * @param {string} webSitePassword - value typed into `#loginform-password`.
 * @returns {Promise<*>} whatever `findLink` yields for the schedule endpoint.
 */
async function doLogIn(page, webSiteUser, webSitePassword) {
  const loginUrl = constants.baseUrl + constants.loginEndpoint;
  const navigationOptions = {
    timeout: constants.timeOut,
    waitUntil: 'load',
  };
  await page.goto(loginUrl, navigationOptions);
  if (isDev) {
    console.log('Navigation to Landing Page Succeeded!!!');
  }

  await page.type('#loginform-email', webSiteUser);
  await page.type('#loginform-password', webSitePassword);
  await page.click('button[type="submit"]');
  if (isDev) {
    console.log('Login submitted');
  }

  // The sidebar showing up is the signal that the login went through.
  await page.waitForSelector('#sidebar');
  if (isDev) {
    await page.screenshot({ path: 'screenshots/home-page.png' });
  }

  const scheduleLink = await findLink(page, constants.scheduleEndpoint);
  return scheduleLink;
}
loggedin
members are able to see, for finding this or any other, a function is available, which receives as parameters the page
instance and the endpoint
to look for as an href:
async function findLink(page, endpoint) {
const pageLinks = await page.evaluate(() =>
Array.from(document.querySelectorAll('a[href]'), a => a.getAttribute('href')),
);
return pageLinks.includes(endpoint) || null;
}
browser
instance as parameter and close
it.
async function closeBrowser(browser) {
return browser.close();
}
Puppeteer
nodemailer
Gmail
, it is mandatory to enable less secure apps; this will create a new password for just the particular application you are trying to link to. You can read more here in nodemailer or in Google Support.
const nodemailer = require('nodemailer');
/**
 * Sends the bookings as an HTML email through Gmail.
 *
 * The returned promise settles only after the send attempt has finished, so
 * callers can await delivery. Delivery is best-effort: a send failure is
 * logged and never rejects the returned promise.
 *
 * @param {string} weekBookings - HTML used as the body of the email.
 * @param {object} options
 * @param {string} options.authUser - Gmail account used to authenticate.
 * @param {string} options.appPassword - password used to authenticate.
 * @param {string} options.emailFrom - sender address.
 * @param {string} options.emailTo - recipient address.
 * @returns {Promise<object|undefined>} the info object reported by the
 *   transport on success, `undefined` when sending failed.
 */
async function sendEmail(weekBookings, { authUser, appPassword, emailFrom, emailTo }) {
  const mail = nodemailer.createTransport({
    service: 'gmail',
    auth: {
      user: authUser,
      pass: appPassword,
    },
  });
  const mailOptions = {
    from: emailFrom,
    to: emailTo,
    subject: 'Your bookings for this week',
    html: weekBookings,
  };

  try {
    // Without a callback, sendMail returns a promise; awaiting it keeps this
    // function's promise pending until the message was actually handed over.
    const info = await mail.sendMail(mailOptions);
    console.log(`Email sent: ${info.response}`);
    return info;
  } catch (error) {
    console.log(error);
    return undefined;
  }
}
module.exports = {
sendEmail,
};
authUser
, appPassword
, email from/to
and the html
to be sent as the email.
WEB_SITE_USER=YOUR_USER@YOUR_EMAIL_DOMAIN.com WEB_SITE_PASSWORD=YOUR_PASSWORD
[email protected] GMAIL_APP_PASSWORD=YOUR_APP_PASSWORD
[email protected] [email protected]
BOOKING_PREFER_TIME=06:55:00 npm run dev
nodemon
setting all the expected process.env
variables in port 3000 by default, so just use Postman
for hitting http://localhost:3000/booking/book-me
or http://localhost:3000/booking/my-bookings
and a result will be retrieved.Heroku's
sections, and highly suggested to use Kaffeine).process.env
passed to the terminal when running locally are set as Heroku's
environment variables, then the deploy is transparent.Captcha
, I say "sort of" because there are ways to skip it; some companies even pay regular users to help them recognize captchas
, you can read more over here.reCaptcha
is ignored, some others appear right after submitting the login, so randomly fails; I opened an issue in puppeteer-extra
, an npm lib extension for puppeteer
which works hand in hand with 2captcha; I'm watching the issue closely, and if a fix for the random issue arrives I'll edit the post.
Heroku
are done by a Cron-Job, it is fast and easy, and I received a custom email when the process randomly fails(the idea is to make it work permanently!).36