build setup & clean
parent a0cb920091
commit c1162550f4
.gitignore (vendored) | 3
@@ -1,5 +1,8 @@
 # ---> Node
 # Logs
+build/**
+compiled/**
 
+*.csv
 
 logs
package-lock.json (generated) | 2223 (file diff suppressed because it is too large)
package.json | 19

@@ -2,9 +2,21 @@
   "name": "@aterve/contenteggstockcrawler",
   "version": "1.0.2",
   "description": "",
-  "main": "./dist/core.js",
+  "main": "build/cli.js",
+  "bin": {
+    "crawler": "build/cli.js"
+  },
   "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "echo \"Error: no test specified\" && exit 1",
+    "package-linux": "pkg . --targets node18-linux-x64 --output compiled/crawler-linux64",
+    "package-win": "pkg . --targets node18-windows-x64 --output compiled/crawler-win64",
+    "build": "tsc"
   },
+  "pkg": {
+    "scripts": "build/**/*.js",
+    "assets": [
+      "node_modules/**"
+    ]
+  },
   "repository": {
     "type": "git",
@@ -25,6 +37,7 @@
   },
   "devDependencies": {
     "@types/got": "^9.6.12",
-    "@types/jsonfile": "^6.1.1"
+    "@types/jsonfile": "^6.1.1",
+    "pkg": "^5.8.1"
   }
 }
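The new bin entry makes the compiled build/cli.js installable as a "crawler" command, separate from the standalone binaries produced by the package-linux and package-win scripts. For an npm bin entry the emitted file generally needs a Node shebang, which tsc preserves from the top of the source entry point. Below is a minimal sketch of what such an entry file could look like; the file name src/cli.ts and everything in its body are assumptions, not shown in this commit.

#!/usr/bin/env node
// Hypothetical src/cli.ts entry point (name and body assumed, not part of this commit).
// tsc keeps this shebang line in the emitted build/cli.js, which "main" and the
// "crawler" bin entry now point at.

async function main(): Promise<void> {
    // Same positional argument the crawler reads via process.argv.slice(2).
    const sitemapUrl = process.argv[2];
    if (!sitemapUrl) {
        console.log("Usage: crawler <sitemap-url>");
        process.exit(1);
    }
    // ... hand off to the crawling code changed below ...
}

main();

With that in place, "npm run build" compiles into build/, and "npm run package-linux" / "npm run package-win" write the standalone binaries to compiled/; both directories are covered by the new .gitignore entries.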
@@ -1,6 +1,6 @@
 
 import jsonfile from "jsonfile";
-import cheerio from "cheerio";
+import cheerio, { CheerioAPI } from "cheerio";
 import fs from "fs";
 import { getDomain } from "tldts";
 import { Browser, launch } from "puppeteer";
@@ -41,11 +41,17 @@ async function start() {
     let params = process.argv.slice(2);
     sitemapUrl = params[0];
 
-    // Crawl Sitemap
-    console.log(`Crawling input sitemap: ${sitemapUrl}`);
-    const sitemapRequest = await axios.get(sitemapUrl);
-    const $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
+    let $: CheerioAPI = null;
+    try {
+        // Crawl Sitemap
+        console.log(`Crawling input sitemap: ${sitemapUrl}`);
+        const sitemapRequest = await axios.get(sitemapUrl);
+        $ = cheerio.load(sitemapRequest.data, { xmlMode: true });
+    } catch (error) {
+        console.log("[ERROR] Error occurred crawling main sitemap.")
+        process.exit(0);
+    }
 
     $('loc').each(function() {
         const url = $(this).text();
 
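Below is a standalone sketch of the error-handling pattern this hunk introduces: fetch the sitemap, parse it as XML with cheerio, and give up early if the request fails. It assumes axios and cheerio as used in the diff; the helper names loadSitemap and collectSitemapUrls are illustrative, and the sketch returns null/an empty list where the CLI itself calls process.exit(0).

import axios from "axios";
import cheerio, { CheerioAPI } from "cheerio";

// Fetch a sitemap and parse it in XML mode; report failure instead of throwing.
async function loadSitemap(sitemapUrl: string): Promise<CheerioAPI | null> {
    try {
        console.log(`Crawling input sitemap: ${sitemapUrl}`);
        const sitemapRequest = await axios.get(sitemapUrl);
        return cheerio.load(sitemapRequest.data, { xmlMode: true });
    } catch (error) {
        console.log("[ERROR] Error occurred crawling main sitemap.");
        return null; // the CLI calls process.exit(0) at this point instead
    }
}

// Collect every <loc> URL listed in the sitemap.
async function collectSitemapUrls(sitemapUrl: string): Promise<string[]> {
    const $ = await loadSitemap(sitemapUrl);
    if ($ === null) {
        return [];
    }
    const urls: string[] = [];
    $("loc").each((index, element) => {
        urls.push($(element).text());
    });
    return urls;
}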
@@ -22,14 +22,10 @@ export namespace LDJsonParser {
         $('script[type="application/ld+json"]').each((index, element) => {
             snippets.push($(element).html());
         });
 
-        console.log("Found " + snippets.length + " snippets");
-
         for (const snippet of snippets) {
             try {
                 let json = JSON.parse(snippet);
 
-                console.log(json);
-
                 if (json["@type"]) {
                     if (json["@type"].includes("Product")) {
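For context, a sketch of the kind of extraction this LDJsonParser hunk is trimming log output from: collect application/ld+json snippets from a page and keep the entries whose @type mentions Product. The function name extractProducts and its exact shape are illustrative, not the project's own code.

import cheerio from "cheerio";

// Pull JSON-LD snippets out of a page and keep the Product entries.
function extractProducts(html: string): any[] {
    const $ = cheerio.load(html);
    const products: any[] = [];

    $('script[type="application/ld+json"]').each((index, element) => {
        const snippet = $(element).html();
        if (!snippet) {
            return; // empty script tag
        }
        try {
            const json = JSON.parse(snippet);
            // "@type" can be a single string or an array of types in JSON-LD;
            // includes() covers both cases, as in the code above.
            if (json["@type"] && json["@type"].includes("Product")) {
                products.push(json);
            }
        } catch (error) {
            // Snippets that are not valid JSON are skipped.
        }
    });

    return products;
}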
@@ -6,7 +6,7 @@
     "noImplicitAny": true,
     "moduleResolution": "node",
     "sourceMap": true,
-    "outDir": "dist",
+    "outDir": "build",
     "baseUrl": ".",
     "paths": {
         "*": ["node_modules/*", "src/types/*"]