Back to packages list

Vals using node-html-parser

Description from the NPM package:
A very fast HTML parser, generating a simplified DOM, with basic element query support.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import { extractContent } from "https://esm.town/v/iakovos/extractContent";
import { fetchText } from "https://esm.town/v/iakovos/fetchText";
import { getLink } from "https://esm.town/v/iakovos/getLink";
import { getMediaAttributeUrl } from "https://esm.town/v/iakovos/getMediaAttributeUrl";
/**
 * Fetches an RSS or Atom feed and converts its entries into `FeedItem`s.
 *
 * The feed XML is converted to a compact JSON tree; entries are read from
 * `rss.channel.item` (RSS) or, failing that, `feed.entry` (Atom). Each entry's
 * HTML content (as returned by `extractContent`) is parsed so that the first
 * non-caption paragraph and the first `<img>` can be used as description and
 * image.
 *
 * @param url - Address of the feed, passed as-is to `fetchText`.
 * @returns The parsed entries. An empty array is returned when `fetchText`
 *   yields nothing, when the feed has no entries, or when any error occurs
 *   (the error is logged, not rethrown).
 */
export const fetchAndParseFeeds = async (url: string): Promise<FeedItem[]> => {
  try {
    const { xml2js } = await import("https://deno.land/x/xml2js@1.0.0/mod.ts");
    const { parse } = await import("npm:node-html-parser");
    const xml = await fetchText(url);
    if (!xml) {
      return [];
    }
    const json = xml2js(xml, { compact: true });
    const rawItems = json.rss?.channel?.item || json.feed?.entry;
    // Compact output represents a lone <item>/<entry> as a plain object rather
    // than a one-element array, so normalise before mapping. Without this a
    // single-entry feed would throw on `.map` and be reported as empty.
    const items = !rawItems ? [] : Array.isArray(rawItems) ? rawItems : [rawItems];
    const parsedItems: FeedItem[] = items.map((item) => {
      const content = extractContent(item);
      const document = content && parse(content);
      const mediaThumbnail = getMediaAttributeUrl(
        item["media:thumbnail"],
      );
      const mediaContent = getMediaAttributeUrl(
        item["media:content"],
      );
      // Prefer the first paragraph that is not an image caption.
      const paragraph = document?.querySelector("p:not(.caption)")?.textContent?.trim() ?? "";
      const imgTag = document?.querySelector("img");
      const imgSrc = imgTag?.getAttribute("src");
      // Inline image first, then the media:* attributes, else no image.
      const image = imgSrc || mediaThumbnail || mediaContent || null;
      const link = getLink(item);
      // Titles may arrive as plain text or wrapped in CDATA.
      const { _text, _cdata } = item.title ?? {};
      const title = _text ?? _cdata ?? "";
      // RSS uses <pubDate>; Atom uses <updated>.
      const pubDate = item.pubDate?._text ?? item.updated?._text ?? "";
      const description = paragraph || item.description?._text
        || item.description?._cdata || "";
      return { title, description, image, pubDate, link };
    });
    return parsedItems;
  } catch (error) {
    console.error("Error while fetching and parsing feeds:", error);
    return [];
  }
};
/** One feed entry as produced by `fetchAndParseFeeds`. */
interface FeedItem {
// Entry title (plain text or CDATA); "" when the entry has none.
title: string;
// First non-caption paragraph of the entry content, else the entry's
// description text; "" when neither is available.
description: string;
// `src` of the first <img> in the content, else a media:thumbnail /
// media:content URL; null when no image was found.
image: string | null;
// Text of <pubDate> or, failing that, <updated>; "" when absent.
// Stored as found in the feed, not parsed into a Date.
pubDate: string;
// Value returned by `getLink` for the entry.
link: string;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/**
 * Parses an HTML table out of an HTML string into an array of row objects.
 *
 * Assumes a single table in the string with a standard `tr > th` header row
 * followed by `tr > td` data rows. The header cell texts (trimmed) become the
 * keys of every returned object.
 *
 * Each cell value is one of:
 * - an array of `{ text, href }` objects when the cell contains links;
 * - an array of the distinct `<span>` texts when there is more than one;
 * - otherwise the cell's text content, or `undefined` when it is empty.
 *
 * @param html - HTML source containing the table.
 * @returns One object per data row; an empty array when the HTML contains no
 *   `<tr>` at all.
 */
export const parseTable = async (html: string) => {
  const { parse } = await import("npm:node-html-parser");
  const dom = parse(html);
  const headerRow = dom.querySelector("tr");
  // No rows at all (empty or table-less input): nothing to parse.
  if (!headerRow) {
    return [];
  }
  const _ = await import("npm:lodash-es");
  const keys = [...headerRow.querySelectorAll("th")].map((th) =>
    th.textContent.trim()
  );
  // Skip the first row, which is taken to be the header.
  const rows = [...dom.querySelectorAll("tr")].slice(1).map((tr) =>
    [...tr.querySelectorAll("td")].map((td) => {
      const links = [...td.querySelectorAll("a")].map((a) => ({
        text: a.textContent,
        href: a.getAttribute("href"),
      }));
      if (links.length) {
        return links;
      }
      // De-duplicate span texts; only treat the cell as a list when more
      // than one distinct value remains.
      const spans = [
        ...new Set(
          [...td.querySelectorAll("span")].map((span) => span.textContent),
        ),
      ];
      if (spans.length > 1) {
        return spans;
      }
      return td.textContent || undefined;
    })
  );
  return rows.map((r) => Object.fromEntries(_.zip(keys, r)));
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText";
/**
 * Scrapes the Val Town blog index page and returns one entry per listed post.
 *
 * Posts are the elements matching `.notion-collection-list__item`. The URL is
 * built from the item's anchor `href`, the title from the last `<span>` inside
 * the item's `.notion-semantic-string`, and the date from its `.date` element
 * (left undefined when that element is missing).
 *
 * List items that lack an anchor `href` or a title span are skipped, so a
 * single malformed item no longer makes the whole call throw.
 *
 * @returns The posts found on https://blog.val.town, in page order.
 */
export async function valTownBlogJSON(): Promise<Blog[]> {
  const { parse } = await import("npm:node-html-parser");
  const text = await fetchText("https://blog.val.town");
  const articles = [
    ...parse(text).querySelectorAll(".notion-collection-list__item"),
  ];
  return articles.flatMap((article) => {
    const href = article
      .querySelector(".notion-collection-list__item-anchor")
      ?.getAttribute("href");
    const titleSpans = article
      .querySelector(".notion-semantic-string")
      ?.querySelectorAll("span") ?? [];
    const titleSpan = titleSpans[titleSpans.length - 1];
    // Without a link or a title there is no usable entry; drop the item
    // instead of dereferencing null or emitting "...val.townundefined".
    if (href == null || !titleSpan) {
      return [];
    }
    return [{
      url: "https://blog.val.town" + href,
      title: titleSpan.innerText,
      date: article.querySelector(".date")?.innerText,
      description: "",
    }];
  });
}
/**
 * A blog post entry as returned by `valTownBlogJSON`.
 * NOTE(review): the objects built there also carry `description: ""`, which
 * is not declared on this interface.
 */
interface Blog {
// Absolute post URL: "https://blog.val.town" followed by the anchor's href.
url: string;
// Post title, read from the last <span> of the title element.
title: string;
// Text of the item's `.date` element, unparsed.
// NOTE(review): may be undefined at runtime when no `.date` element exists.
date: string;
}
1
Next