basic crawling system
This commit is contained in:
129
src/core.ts
Normal file
129
src/core.ts
Normal file
@@ -0,0 +1,129 @@
|
||||
import puppeteer from "puppeteer";
|
||||
import jsonfile from "jsonfile";
|
||||
import cheerio from "cheerio";
|
||||
import { getDomain } from "tldts";
|
||||
|
||||
// Import website modules
|
||||
import {
|
||||
FitnessKoerier,
|
||||
FitnessApparaat
|
||||
} from "./modules/websites";
|
||||
|
||||
// Globals
// Shared Puppeteer browser instance. start() assigns it once the browser has
// launched; the crawl functions open their pages on it.
let browser: puppeteer.Browser = null;

// Kick off the crawl only after the globals above are initialised, so start()
// never runs while `browser` is still uninitialised. start() catches and logs
// its own errors, so the returned promise is intentionally not awaited.
void start();
|
||||
|
||||
/**
 * Entry point of the crawler.
 *
 * Launches a headless Puppeteer browser, loads the list of blog page urls from
 * `./urls-check.json` and crawls them one after another via crawlBlogPage().
 * Any error is logged rather than rethrown, so the returned promise resolves
 * in every case.
 */
async function start() {
    try {
        console.log(`---- ContentEggStockCrawler ----`);

        console.log(`Starting crawler browser..`);
        browser = await puppeteer.launch({ headless: true });

        const urls = await jsonfile.readFile("./urls-check.json");
        // Fail early with a readable message instead of iterating over a non-array.
        if (!Array.isArray(urls)) {
            throw new Error("urls-check.json must contain a JSON array of urls");
        }
        console.log(`Loaded ${urls.length} urls from file! (urls-check.json)`);

        console.log(`Initialization done!`);
        console.log(`------------------------------------- \n`);

        // One page at a time: each crawl is awaited before the next one starts.
        for (const url of urls) {
            await crawlBlogPage(url);
        }
    } catch (error) {
        console.error(`An Error Occured!`, error);
    } finally {
        // Always shut the browser down; otherwise the launched browser process
        // is left running once the crawl has finished or failed.
        if (browser) {
            try {
                await browser.close();
            } catch (closeError) {
                console.error(`Failed to close crawler browser!`, closeError);
            }
        }
    }
}
|
||||
|
||||
/**
 * Crawls a single blog page and reports the stock status of every product
 * link found on it.
 *
 * The page is rendered in the shared browser, its body HTML is parsed with
 * cheerio, and each anchor matching `.row-products .cegg-list-logo-title a`
 * is passed to crawlProductStock(). One line per checked product is logged.
 *
 * @param url Address of the blog page to crawl.
 * @returns Nothing; results are only logged. Errors are logged and swallowed
 *          so that the caller can continue with the next page.
 */
async function crawlBlogPage(url: string) {
    try {
        console.log(`Crawling blog page: ${url}`);

        // Open new page and goto url
        const page = await browser.newPage();
        let html: string;
        try {
            await page.goto(url, { waitUntil: 'networkidle2' });

            // Get html from page
            html = await page.evaluate(() => document.body.innerHTML);
        } finally {
            // Close the page even when navigation fails, so pages do not pile up.
            await page.close();
        }

        // Load html in cheerio object
        const $ = cheerio.load(html);

        const products: Array<{
            name: string;
            blogUrl: string;
            url: string;
            domain: string;
            inStock: boolean;
        }> = [];

        console.log(`Detected ${$(".row-products").length} content egg products!`);

        $(".row-products .cegg-list-logo-title a").each((_index, element) => {
            const link = $(element);
            const blogUrl = link.attr("href");

            // An anchor without href cannot be followed; do not queue it.
            if (!blogUrl) {
                return;
            }

            products.push({
                // html() may yield null; fall back to an empty name.
                name: (link.html() || "").trim(),
                blogUrl: blogUrl,
                url: "",
                domain: "",
                inStock: false,
            });
        });

        console.log("Checking product stocks...");

        for (const product of products) {
            const status = await crawlProductStock(product.blogUrl);

            // crawlProductStock() yields nothing when the product page could not
            // be crawled (it logs the failure itself). Skip just this product
            // instead of aborting the rest of the list.
            if (!status) {
                continue;
            }

            // Normalise each slot so an unexpected shape cannot leak non-string
            // or non-boolean values into the product record.
            const [domain, productUrl, inStock] = status;
            product.domain = String(domain);
            product.url = typeof productUrl === "string" ? productUrl : "";
            product.inStock = inStock === true;

            const mark = product.inStock ? "✅" : "❌";
            console.log(`${mark} ${product.name} - ${product.domain}`);
        }
    } catch (error) {
        // Log the cause as well, matching crawlProductStock()'s error handling.
        console.error(error);
        console.log("Error while trying to crawl page! Skipping...");
    }
}
|
||||
|
||||
/**
 * Opens a product link and asks the matching website module whether the
 * product is in stock.
 *
 * The link is loaded in a new page of the shared browser. The domain is taken
 * from the page's url after navigation (via tldts `getDomain`) and used to
 * select the website module.
 *
 * @param url Product link to open (as found on the blog page).
 * @returns `[domain, pageUrl, inStock]` when a website module handled the page;
 *          `["NOT_MODULE", pageUrl, false]` when no module exists for the
 *          domain; `undefined` when the page could not be crawled.
 */
async function crawlProductStock(url: string): Promise<[string, string, boolean] | undefined> {
    try {
        // Open new page and goto url
        const page = await browser.newPage();
        let html: string;
        let pageUrl: string;
        try {
            await page.goto(url, { waitUntil: 'networkidle2' });

            // Get html from page
            html = await page.evaluate(() => document.body.innerHTML);

            // Read the address while the page is still open.
            pageUrl = page.url();
        } finally {
            // Close the page even when navigation fails, so pages do not pile up.
            await page.close();
        }

        // Get domain from url
        const domain = getDomain(pageUrl);

        switch (domain) {
            case "fitnesskoerier.nl":
                return [domain, pageUrl, (await FitnessKoerier.check(html)) === true];

            case "fitnessapparaat.nl":
                return [domain, pageUrl, (await FitnessApparaat.check(html)) === true];

            default:
                console.error(`-- ${domain} is not an available website module! Can't check stock!`);
                // Keep the same three slots as the other branches so callers
                // reading [domain, url, inStock] get a string url and a boolean.
                return ["NOT_MODULE", pageUrl, false];
        }
    } catch (error) {
        console.error(error);
        console.log("Error while trying to crawl page! Skipping...");
        return undefined;
    }
}
|
||||
|
||||
|
46
src/modules/websites.ts
Normal file
46
src/modules/websites.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import cheerio from "cheerio";
|
||||
import { isTypeReferenceNode } from "typescript";
|
||||
|
||||
/**
 * Placeholder website module. Judging by its name it is meant as a starting
 * point for new website modules; it performs no real stock detection.
 */
export namespace Template {
    /**
     * Parses the given HTML and always reports "not in stock".
     *
     * @param html HTML of the product page.
     * @returns Always `false`, also when parsing throws (the failure is logged).
     */
    export async function check(html: string) {
        const inStock = false;

        try {
            // Parsing is the only work done here; the result is not inspected.
            cheerio.load(html);
        } catch (error) {
            console.error(`Error occured during stock check!`);
        }

        return inStock;
    }
}
|
||||
|
||||
|
||||
/**
 * Website module for fitnesskoerier.nl product pages.
 */
export namespace FitnessKoerier {
    /**
     * Determines the stock status from the product page HTML.
     *
     * Reads the `content` attribute of the `[itemprop=availability]` element
     * inside `.product-list .stock-msg`.
     *
     * @param html HTML of the product page.
     * @returns `false` when that attribute equals `"out_of_stock"` or when the
     *          check throws; `true` otherwise (including when the element or
     *          attribute is absent).
     */
    export async function check(html: string) {
        try {
            const $ = cheerio.load(html);
            const stockMessage = $('.product-list .stock-msg');
            const availability = stockMessage.find("[itemprop=availability]").attr("content");

            return availability !== "out_of_stock";
        } catch (error) {
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
||||
|
||||
|
||||
/**
 * Website module for fitnessapparaat.nl product pages.
 */
export namespace FitnessApparaat {
    /**
     * Determines the stock status from the product page HTML.
     *
     * Counts `.stock-red` elements inside the `[itemprop="offers"]` block.
     *
     * @param html HTML of the product page.
     * @returns `true` when no `.stock-red` element is found inside the offers
     *          block, `false` when at least one is found or the check throws.
     */
    export async function check(html: string) {
        try {
            const $ = cheerio.load(html);

            // The attribute selector needs its closing bracket to be valid.
            const redMarkers = $('[itemprop="offers"]').find(".stock-red").length;

            // NOTE(review): assumes a `.stock-red` element marks the product as
            // unavailable - confirm against a live out-of-stock page.
            return redMarkers === 0;
        } catch (error) {
            console.log(error);
            console.error(`Error occured during stock check!`);
            return false;
        }
    }
}
|
Reference in New Issue
Block a user