Debugging
Inspector CLI
node --inspect server.js # Start with the inspector listening (default 127.0.0.1:9229).
node --inspect-brk server.js # Same, but break before user code runs.
Enable debug logging for specific core modules:
NODE_DEBUG=fs,net,stream yarn test
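The same mechanism is available to your own code through util.debuglog; a minimal sketch (the section name myapp is an arbitrary example):
// debug.js — messages are printed only when NODE_DEBUG includes "myapp"
const util = require('util')
const log = util.debuglog('myapp')
log('connected to %s', 'localhost:3000')
// Run with: NODE_DEBUG=myapp node debug.js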
Node.js Web Crawler
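The crawler below relies on three third-party packages; assuming npm, install them first (Playwright also needs the Firefox binary it will launch):
npm install axios cheerio playwright
npx playwright install firefox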
const axios = require('axios')
const cheerio = require('cheerio')
const playwright = require('playwright')
const url = 'https://scrapeme.live/shop/page/1/'
const useHeadless = false // set to true to render pages with Playwright instead of plain HTTP requests
const maxVisits = 30 // arbitrary cap on the number of pages to visit
const visited = new Set() // URLs already crawled
const allProducts = [] // accumulated results
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)) // handy for adding polite delays between requests
// Render the page in headless Firefox and return the resulting HTML
async function getHtmlPlaywright(url) {
  const browser = await playwright.firefox.launch()
  try {
    const context = await browser.newContext()
    const page = await context.newPage()
    await page.goto(url)
    return await page.content()
  } finally {
    await browser.close() // release the browser even if navigation fails
  }
}
async function getHtmlAxios(url) {
const { data } = await axios.get(url)
return data
}
async function getHtml(url) {
return useHeadless ? await getHtmlPlaywright(url) : await getHtmlAxios(url)
}
// Pull the id, title and price out of each product card
function extractContent($) {
return $('.product')
.map((_, product) => {
const $product = $(product)
return {
id: $product.find('a[data-product_id]').attr('data-product_id'),
title: $product.find('h2').text(),
price: $product.find('.price').text(),
}
})
.toArray()
}
// Collect the unique pagination URLs found on the page
function extractLinks($) {
return [
...new Set(
$('.page-numbers a')
.map((_, a) => $(a).attr('href'))
.toArray()
),
]
}
// Visit a page, collect its products and queue any pagination links not seen yet
async function crawl(url) {
visited.add(url)
console.log('Crawl: ', url)
const html = await getHtml(url)
const $ = cheerio.load(html)
const content = extractContent($)
const links = extractLinks($)
links
.filter(link => !visited.has(link))
.forEach((link) => {
q.enqueue(crawlTask, link)
})
allProducts.push(...content)
// We can see how the list grows. Gotta catch 'em all!
console.log(allProducts.length)
}
// Minimal promise queue that runs at most `concurrency` tasks at a time
// Change the default concurrency or pass it as a param
function queue(concurrency = 4) {
let running = 0
const tasks = []
return {
enqueue: async (task, ...params) => {
tasks.push({ task, params })
if (running >= concurrency)
return
++running
while (tasks.length) {
const { task, params } = tasks.shift()
await task(...params)
}
--running
},
}
}
// Queue task: enforces the maxVisits cap and skips URLs already crawled
async function crawlTask(url) {
if (visited.size >= maxVisits) {
console.log('Over Max Visits, exiting')
return
}
if (visited.has(url))
return
await crawl(url)
}
// Kick things off with the seed URL
const q = queue()
q.enqueue(crawlTask, url)
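To keep the scraped data, dump allProducts to disk once crawling stops. A minimal sketch using the built-in fs module; the products.json filename and how you detect that the queue is idle are up to you:
const fs = require('fs')
function saveResults() {
  fs.writeFileSync('products.json', JSON.stringify(allProducts, null, 2))
  console.log(`Saved ${allProducts.length} products`)
}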
Web scraping with impersonation:
const puppeteer = require('puppeteer-extra')
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
(async () => {
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto('https://example.com')
// Now your Puppeteer script is enhanced with advanced evasion techniques
// Proceed with your web scraping tasks
await browser.close()
})()
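puppeteer-extra wraps the regular puppeteer package, so both must be installed along with the plugin (assuming npm):
npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth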
Device emulation and fingerprint overrides with Playwright:
const { chromium, devices } = require('playwright')
const iPhone11 = devices['iPhone 11'];
(async () => {
const browser = await chromium.launch()
const context = await browser.newContext({
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+ '(KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
geolocation: { latitude: 48.8584, longitude: 2.2945 }, // Paris, France
permissions: ['geolocation'],
locale: 'fr-FR',
...iPhone11, // iPhone 11 preset (userAgent, viewport, touch, etc.); spread last, it overrides the userAgent above
})
const page = await context.newPage()
await page.goto('https://example.com')
// Your scraping logic here
await browser.close()
})()
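To confirm the overrides are applied, read a few values back from the page before closing the browser (a quick sanity check, not part of the snippet above):
// add before browser.close():
console.log(await page.evaluate(() => navigator.userAgent)) // iPhone UA from the device preset
console.log(await page.evaluate(() => navigator.language)) // "fr-FR" from the locale option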