basic crawling system

This commit is contained in:
2020-11-13 22:23:33 +01:00
parent 09d814102e
commit 4fb8a19661
6 changed files with 791 additions and 0 deletions

129
src/core.ts Normal file
View File

@@ -0,0 +1,129 @@
import puppeteer from "puppeteer";
import jsonfile from "jsonfile";
import cheerio from "cheerio";
import { getDomain } from "tldts";
// Import website modules
import {
FitnessKoerier,
FitnessApparaat
} from "./modules/websites";
// Globals
/**
 * Shared Puppeteer browser instance, assigned by `start()` once the browser
 * has been launched. It holds `null` until then.
 */
let browser: puppeteer.Browser = null;

// Kick off the crawler only after the globals above are initialised, so that
// no code path inside `start()` can reach `browser` while the `let` binding is
// still in its temporal dead zone.
start();
/**
 * Entry point of the crawler.
 *
 * Launches a headless Puppeteer browser, stores it in the module-level
 * `browser` variable, reads the list of blog urls from `./urls-check.json`
 * and crawls them one after another with `crawlBlogPage`.
 *
 * Errors are logged rather than rethrown. The browser is always closed before
 * the function finishes so no Chromium process is left behind.
 */
async function start(): Promise<void> {
  // Local handle used for cleanup; avoids depending on the state of the global.
  let launched: puppeteer.Browser | undefined;
  try {
    console.log(`---- ContentEggStockCrawler ----`);
    console.log(`Starting crawler browser..`);
    launched = await puppeteer.launch({ headless: true });
    browser = launched;

    const urls = await jsonfile.readFile("./urls-check.json");
    // The file content is untyped JSON: make sure it really is a list before iterating.
    if (!Array.isArray(urls)) {
      throw new TypeError(`urls-check.json must contain a JSON array of urls`);
    }
    console.log(`Loaded ${urls.length} urls from file! (urls-check.json)`);
    console.log(`Initialization done!`);
    console.log(`------------------------------------- \n`);

    // Crawl sequentially so only one blog page is processed at a time.
    for (const url of urls) {
      await crawlBlogPage(url);
    }
  } catch (error) {
    console.error(`An Error Occured!`, error);
  } finally {
    if (launched) {
      try {
        await launched.close();
      } catch (closeError) {
        console.error(`Failed to close crawler browser!`, closeError);
      }
    }
  }
}
/**
 * Crawls one blog page, collects the product links found in its
 * `.row-products` blocks and checks the stock of every product via
 * `crawlProductStock`, logging one line per product.
 *
 * Failures are logged and swallowed so the caller can continue with the next
 * blog page.
 *
 * @param url Address of the blog page to crawl.
 */
async function crawlBlogPage(url: string) {
  type BlogProduct = {
    name: string;
    blogUrl: string;
    url: string;
    domain: string;
    inStock: boolean;
  };

  try {
    console.log(`Crawling blog page: ${url}`);

    // Open new page, goto url and grab the rendered html
    const page = await browser.newPage();
    let html: string;
    try {
      await page.goto(url, { waitUntil: 'networkidle2' });
      html = await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Always release the tab, also when navigation or evaluation fails.
      await page.close();
    }

    // Load html in cheerio object
    const $ = cheerio.load(html);
    const products: BlogProduct[] = [];
    console.log(`Detected ${$(".row-products").length} content egg products!`);

    $(".row-products .cegg-list-logo-title a").each((_index, element) => {
      const link = $(element);
      const blogUrl = link.attr("href");
      if (!blogUrl) {
        // A link without href cannot be followed to a shop page.
        return;
      }
      products.push({
        // `.html()` yields null for an empty selection; fall back to an empty name.
        name: (link.html() ?? "").trim(),
        blogUrl,
        url: "",
        domain: "",
        inStock: false,
      });
    });

    console.log("Checking product stocks...");
    for (const product of products) {
      const status = await crawlProductStock(product.blogUrl);
      if (!status) {
        // The stock crawl failed; keep the defaults and move on to the next product.
        console.log(`${product.name} - stock could not be checked`);
        continue;
      }

      // Expected layout: [domain, final product url, inStock]. Every slot is
      // validated so a shorter or differently typed result cannot corrupt the record.
      const [domain, productUrl, inStock] = status;
      product.domain = typeof domain === "string" ? domain : "";
      product.url = typeof productUrl === "string" ? productUrl : "";
      product.inStock = inStock === true;

      // Make the outcome of the stock check visible in the log line.
      if (product.inStock) {
        console.log(`${product.name} - ${product.domain} - in stock`);
      } else {
        console.log(`${product.name} - ${product.domain} - out of stock`);
      }
    }
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
  }
}
/**
 * Opens a product link, follows it to the page it finally lands on and asks
 * the website module matching that page's domain whether the product is in
 * stock.
 *
 * @param url Product link to open (redirects are followed by the browser).
 * @returns `[domain, finalUrl, inStock]`. For a domain without a website
 *          module the domain slot is `"NOT_MODULE"` and `inStock` is `false`.
 *          `undefined` is returned when the page could not be crawled.
 */
async function crawlProductStock(url: string): Promise<[string, string, boolean] | undefined> {
  try {
    // Open new page and goto url
    const page = await browser.newPage();
    let html: string;
    let finalUrl: string;
    try {
      await page.goto(url, { waitUntil: 'networkidle2' });
      // Get html from page
      html = await page.evaluate(() => document.body.innerHTML);
      // Read the landing url while the page is still open.
      finalUrl = page.url();
    } finally {
      // Always release the tab, also when navigation or evaluation fails.
      await page.close();
    }

    // Get domain from url
    const domain = getDomain(finalUrl);
    switch (domain) {
      case "fitnesskoerier.nl":
        // `=== true` keeps the tuple strictly boolean whatever the module returns.
        return [domain, finalUrl, (await FitnessKoerier.check(html)) === true];
      case "fitnessapparaat.nl":
        return [domain, finalUrl, (await FitnessApparaat.check(html)) === true];
      default:
        console.error(`-- ${domain} is not an available website module! Can't check stock!`);
        // Keep the same three-slot layout as the other branches.
        return ["NOT_MODULE", finalUrl, false];
    }
  } catch (error) {
    console.error(error);
    console.log("Error while trying to crawl page! Skipping...");
    return undefined;
  }
}

46
src/modules/websites.ts Normal file
View File

@@ -0,0 +1,46 @@
import cheerio from "cheerio";
import { isTypeReferenceNode } from "typescript";
/**
 * Placeholder website module. It parses the html but performs no site-specific
 * check and always reports "not in stock".
 * NOTE(review): presumably meant to be copied when adding a new website module — confirm.
 */
export namespace Template {
  /**
   * @param html Html of the product page.
   * @returns Always `false`.
   */
  export async function check(html: string) {
    let inStock = false;
    try {
      // Parsed document; not inspected by this placeholder.
      const $ = cheerio.load(html);
    } catch (failure) {
      console.error(`Error occured during stock check!`);
    }
    return inStock;
  }
}
/** Stock check for product pages of fitnesskoerier.nl. */
export namespace FitnessKoerier {
  /**
   * Reads the `content` attribute of the `[itemprop=availability]` element
   * inside `.product-list .stock-msg`.
   *
   * @param html Html of the product page.
   * @returns `false` when that attribute equals `"out_of_stock"` or when the
   *          check throws; `true` otherwise (including when the element is missing).
   */
  export async function check(html: string) {
    try {
      const $ = cheerio.load(html);
      const availability = $('.product-list .stock-msg')
        .find("[itemprop=availability]")
        .attr("content");
      return availability !== "out_of_stock";
    } catch (failure) {
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}
/** Stock check for product pages of fitnessapparaat.nl. */
export namespace FitnessApparaat {
  /**
   * Looks for `.stock-red` elements inside the `[itemprop="offers"]` block of
   * the product page.
   *
   * @param html Html of the product page.
   * @returns `true` when no `.stock-red` element is found, `false` when one is
   *          found or when the check throws.
   */
  export async function check(html: string): Promise<boolean> {
    try {
      const $ = cheerio.load(html);
      // The attribute selector needs its closing bracket to be a valid selector.
      const redStockMarkers = $('[itemprop="offers"]').find(".stock-red");
      // NOTE(review): assumes `.stock-red` is only rendered for products that
      // are not available — confirm against the live site markup.
      return redStockMarkers.length === 0;
    } catch (error) {
      console.log(error);
      console.error(`Error occured during stock check!`);
      return false;
    }
  }
}