build setup & clean

This commit is contained in:
Nick Leeman 2023-06-30 14:16:16 +02:00
parent a0cb920091
commit c1162550f4
6 changed files with 2254 additions and 15 deletions

3
.gitignore vendored
View File

@ -1,5 +1,8 @@
# ---> Node # ---> Node
# Logs # Logs
build/**
compiled/**
*.csv *.csv
logs logs

2223
package-lock.json generated

File diff suppressed because it is too large. [Load Diff]

View File

@ -2,9 +2,21 @@
"name": "@aterve/contenteggstockcrawler", "name": "@aterve/contenteggstockcrawler",
"version": "1.0.2", "version": "1.0.2",
"description": "", "description": "",
"main": "./dist/core.js", "main": "build/cli.js",
"bin": {
"crawler": "build/cli.js"
},
"scripts": { "scripts": {
"test": "echo \"Error: no test specified\" && exit 1" "test": "echo \"Error: no test specified\" && exit 1",
"package-linux": "pkg . --targets node18-linux-x64 --output compiled/crawler-linux64",
"package-win": "pkg . --targets node18-windows-x64 --output compiled/crawler-win64",
"build": "tsc"
},
"pkg": {
"scripts": "build/**/*.js",
"assets": [
"node_modules/**"
]
}, },
"repository": { "repository": {
"type": "git", "type": "git",
@ -25,6 +37,7 @@
}, },
"devDependencies": { "devDependencies": {
"@types/got": "^9.6.12", "@types/got": "^9.6.12",
"@types/jsonfile": "^6.1.1" "@types/jsonfile": "^6.1.1",
"pkg": "^5.8.1"
} }
} }

View File

@ -1,6 +1,6 @@
import jsonfile from "jsonfile"; import jsonfile from "jsonfile";
import cheerio from "cheerio"; import cheerio, { CheerioAPI } from "cheerio";
import fs from "fs"; import fs from "fs";
import { getDomain } from "tldts"; import { getDomain } from "tldts";
import { Browser, launch } from "puppeteer"; import { Browser, launch } from "puppeteer";
@ -41,10 +41,16 @@ async function start() {
let params = process.argv.slice(2); let params = process.argv.slice(2);
sitemapUrl = params[0]; sitemapUrl = params[0];
// Crawl Sitemap let $: CheerioAPI = null;
console.log(`Crawling input sitemap: ${sitemapUrl}`); try {
const sitemapRequest = await axios.get(sitemapUrl); // Crawl Sitemap
const $ = cheerio.load(sitemapRequest.data, { xmlMode: true }); console.log(`Crawling input sitemap: ${sitemapUrl}`);
const sitemapRequest = await axios.get(sitemapUrl);
$ = cheerio.load(sitemapRequest.data, { xmlMode: true });
} catch (error) {
console.log("[ERROR] Error occurred crawling main sitemap.")
process.exit(0);
}
$('loc').each(function() { $('loc').each(function() {
const url = $(this).text(); const url = $(this).text();

View File

@ -23,14 +23,10 @@ export namespace LDJsonParser {
snippets.push($(element).html()); snippets.push($(element).html());
}); });
console.log("Found " + snippets.length + " snippets");
for (const snippet of snippets) { for (const snippet of snippets) {
try { try {
let json = JSON.parse(snippet); let json = JSON.parse(snippet);
console.log(json);
if (json["@type"]) { if (json["@type"]) {
if (json["@type"].includes("Product")) { if (json["@type"].includes("Product")) {
if (Array.isArray(json["offers"])) { if (Array.isArray(json["offers"])) {

View File

@ -6,7 +6,7 @@
"noImplicitAny": true, "noImplicitAny": true,
"moduleResolution": "node", "moduleResolution": "node",
"sourceMap": true, "sourceMap": true,
"outDir": "dist", "outDir": "build",
"baseUrl": ".", "baseUrl": ".",
"paths": { "paths": {
"*": ["node_modules/*", "src/types/*"] "*": ["node_modules/*", "src/types/*"]