Back to packages list

Vals using cheerio

Description from the NPM package:
Tiny, fast, and elegant implementation of core jQuery designed specifically for the server
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import cheerio from "npm:cheerio";
// Absolute XPath to a table cell (presumably the value this scraper is after — TODO confirm).
// Nothing in the code visible here reads this constant.
const xpath = "/html/body/div[3]/table[1]/tbody/tr[2]/td/table/tbody/tr[2]/td[3]";
/**
 * Placeholder for an XPath lookup.
 *
 * The argument is ignored and the result is always `undefined`; no lookup is
 * performed.
 */
function getElementByXpath(path) {
  return undefined;
}
/**
 * Downloads a schedule-detail page from ssb.ua.edu, parses it with Cheerio and
 * logs the selection for one cell of a `table.datadisplaytable`.
 *
 * The async function is invoked immediately, so the exported binding is the
 * resulting promise (it resolves to `undefined`), not a callable.
 *
 * @param crn Value placed in the `crn_in` query parameter. Defaults to
 *   "43971", the value that used to be hard-coded in the URL.
 */
export const webscrapeMinhaBibliotecaCatolicaBoxList = (async (crn = "43971") => {
  // `crn` used to be accepted but ignored; it now actually drives the request.
  const sourceUrl =
    `https://ssb.ua.edu/pls/PROD/ua_bwckschd.p_disp_detail_sched?term_in=202340&crn_in=${encodeURIComponent(crn)}`;
  const siteResponse = await fetch(sourceUrl);
  const $ = cheerio.load(await siteResponse.text());
  const $numclass = $(
    "table.datadisplaytable:nth-child(16) > tbody:nth-child(2) > tr:nth-child(2) > td:nth-child(4)",
  );
  // Logs the Cheerio selection object itself (not its text).
  console.log($numclass);
})();

Extract all text content from a URL using Cheerio

Readme
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import { fetch } from "https://esm.town/v/std/fetch";
import * as cheerio from "npm:cheerio";
/**
 * HTTP handler that returns the text content of a remote page.
 *
 * Expects a JSON request body of the form `{ "url": "<address>" }`, fetches
 * that address, parses the HTML with Cheerio and responds with the document
 * text encoded as a JSON string.
 *
 * @param request Incoming request carrying the JSON body.
 * @returns The page text as JSON on success; status 400 when the body is not
 *   valid JSON or carries no usable `url`; status 502 when fetching or parsing
 *   the remote page fails.
 */
export async function extractAllContent(request: Request) {
  // Parse the body on its own so that only genuine JSON errors are reported
  // as "not JSON-encoded".
  let body: unknown;
  try {
    body = await request.json();
  } catch (error) {
    console.error("Error:", error);
    return Response.json({ message: "The body of this request was not JSON-encoded." }, {
      status: 400,
    });
  }

  // Valid JSON may still be `null` or a primitive, so read `url` defensively.
  const url = typeof body === "object" && body !== null && "url" in body
    ? body.url
    : undefined;
  if (typeof url !== "string" || url === "") {
    return Response.json({ message: "URL not found" }, {
      status: 400,
    });
  }

  try {
    // Use std/fetch for proxied request
    const response = await fetch(url);
    const html = await response.text();
    const $ = cheerio.load(html);
    return Response.json($.text());
  } catch (error) {
    // Failures past this point come from the remote page, not from the caller's body.
    console.error("Error:", error);
    return Response.json({ message: "Failed to fetch or parse the requested URL." }, {
      status: 502,
    });
  }
}
1
2
3
4
5
6
7
8
9
10
11
12
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { load } from "npm:cheerio";
/**
 * Scrapes an English Wikipedia article and returns the text of its first
 * `span.geo-default > span` element (the coordinates, judging by the function
 * name).
 *
 * @param args.cityName Article title; it is inserted into the URL exactly as
 *   given, without any escaping.
 * @returns The element's text, as produced by Cheerio's `.text()`.
 */
export async function latLngOfCity(args: { cityName: string }) {
  const articleUrl = `https://en.wikipedia.org/wiki/${args.cityName}`;
  const markup = await fetchText(articleUrl);
  const $ = load(markup);
  return $("span.geo-default > span").first().text();
}
1
2
3
4
5
6
7
8
9
10
11
12
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { load } from "npm:cheerio";
/**
 * Fetches `https://en.wikipedia.org/wiki/<cityName>` and extracts the text of
 * the first element matching `span.geo-default > span`.
 *
 * @param args Object whose `cityName` is appended verbatim to the wiki URL.
 * @returns Text content of the matched element.
 */
export async function latLngOfCity(args: { cityName: string }) {
  const page = await fetchText(`https://en.wikipedia.org/wiki/${args.cityName}`);
  const geoSpans = load(page)("span.geo-default > span");
  const firstGeoSpan = geoSpans.first();
  return firstGeoSpan.text();
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import axios from "npm:axios";
import * as cheerio from "npm:cheerio";
/**
 * HTTP handler that lists the Open Graph `<meta>` tags of a remote page.
 *
 * Expects a JSON request body of the form `{ "url": "<address>" }`, downloads
 * the page with axios, and collects every `meta` element whose `property`
 * attribute starts with `og:` and that has a non-empty `content` attribute.
 *
 * @param request Incoming request carrying the JSON body.
 * @returns A JSON array of `{ property, content }` pairs on success; status
 *   400 when the body is not valid JSON or carries no usable `url`; status 502
 *   when downloading or parsing the remote page fails.
 */
export async function extractOpenGraphTags(request: Request) {
  // Parse the body on its own so that only genuine JSON errors are reported
  // as "not JSON-encoded".
  let body: unknown;
  try {
    body = await request.json();
  } catch (error) {
    console.error("Error:", error);
    return Response.json({ message: "The body of this request was not JSON-encoded." }, {
      status: 400,
    });
  }

  // Valid JSON may still be `null` or a primitive, so read `url` defensively.
  const url = typeof body === "object" && body !== null && "url" in body
    ? body.url
    : undefined;
  if (typeof url !== "string" || url === "") {
    return Response.json({ message: "URL not found" }, {
      status: 400,
    });
  }

  try {
    const response = await axios.get(url);
    const html = response.data;
    const $ = cheerio.load(html);
    const openGraphTags: { property: string; content: string }[] = [];
    $("meta[property^='og:']").each((_index, el) => {
      const property = $(el).attr("property");
      const content = $(el).attr("content");
      // Skip tags that lack either attribute (or have it empty).
      if (property && content) {
        openGraphTags.push({ property, content });
      }
    });
    return Response.json(openGraphTags);
  } catch (error) {
    // Failures past this point come from the remote page, not from the caller's body.
    console.error("Error:", error);
    return Response.json({ message: "Failed to fetch or parse the requested URL." }, {
      status: 502,
    });
  }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import cheerio from "npm:cheerio";
// Page that webScrapeBytesNewsletter downloads and parses.
const NEWSLETTER_URL = "https://bytes.dev/archives";
/**
 * Ensures a URL carries an explicit scheme.
 *
 * @param url Address that may or may not begin with `http://` or `https://`.
 * @returns `url` untouched when it already starts with one of those two
 *   prefixes (case-sensitive match); otherwise `url` prefixed with `http://`.
 */
function normalizeURL(url: string) {
  const hasHttpScheme = /^https?:\/\//.test(url);
  if (hasHttpScheme) {
    return url;
  }
  return `http://${url}`;
}
/**
 * Downloads a resource and returns its body as text.
 *
 * @param url Address to fetch; passed through `normalizeURL` first.
 * @param options Extra fetch settings. They are merged over the default
 *   `redirect: "follow"`, so a caller-supplied `redirect` wins.
 * @returns Promise for the response body text.
 */
async function fetchText(url: string, options?: any) {
  const target = normalizeURL(url);
  // Spreading `undefined` contributes nothing, so no fallback object is needed.
  const init = { redirect: "follow", ...options };
  const response = await fetch(target, init);
  return response.text();
}
/**
 * Scrapes the page at `NEWSLETTER_URL` and reads three fields from the element
 * selected by `main > :nth-child(2)`.
 *
 * @returns `{ id, title, date }`, where `id` is the second space-separated
 *   token of the first `<h3>` child's text converted with `Number` (so it is
 *   `NaN` when that token is missing or non-numeric).
 */
export const webScrapeBytesNewsletter = async () => {
  const $ = cheerio.load(await fetchText(NEWSLETTER_URL));

  // Second element among the children of the links inside the selected
  // section; every field below is read from it.
  const issueCard = $("main > :nth-child(2)").find("a").children().eq(1);
  const headingParts = issueCard.find("h3").children();

  const articleNumber = headingParts.eq(0).text();
  const title = headingParts.eq(1).text();
  const date = issueCard.find("div > div > span").text();

  const id = Number(articleNumber.split(" ")[1]);
  return { id, title, date };
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import { email } from "https://esm.town/v/std/email?v=9";
import { fetch } from "https://esm.town/v/std/fetch";
/**
 * Downloads the doviz.com US-dollar page, takes the inner HTML of the
 * `.value-table` element and sends it with the imported `email` helper.
 *
 * Logs "email sent!" once the `email` call has resolved.
 */
export async function exchangeRate() {
  const { load } = await import("npm:cheerio");
  const response = await fetch(
    "https://kur.doviz.com/serbest-piyasa/amerikan-dolari",
  );
  const markup = await response.text();
  const $ = load(markup);
  // `.html()` is forwarded as-is, whatever it yields for this selection.
  const rateTableHtml = $(".value-table").html();
  await email({
    html: rateTableHtml,
    subject: "TRY/USD exchange rate alert from val.town",
  });
  console.log("email sent!");
}
1
2
3
4
5
6
7
8
9
10
11
12
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=5";
/**
 * Promise for a paragraph of text scraped from the English Wikipedia article
 * on OpenAI.
 *
 * The async function runs immediately, so this export is the pending result
 * rather than a function.
 */
export const webscrapeWikipediaIntro = (async () => {
  const { load } = await import("npm:cheerio");
  const articleHtml = await fetchText("https://en.wikipedia.org/wiki/OpenAI");
  const $ = load(articleHtml);
  // The selector matches every <p> that is the second <p> among its siblings;
  // only the first such match in the document is used.
  const secondParagraphs = $("p:nth-of-type(2)");
  return secondParagraphs.first().text();
})();
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import { fetch } from "https://esm.town/v/std/fetch";
/**
 * HTTP handler that reports the version named on the 1Password Linux beta
 * release-notes page.
 *
 * The version is taken from the text of the first `.c-page-details div`
 * element, which must start with "Updated to <version>".
 *
 * @param request Incoming request (not inspected).
 * @returns JSON response `{ "version": "<version>" }`.
 * @throws Error when the heading text does not match the expected pattern.
 */
export default async function(request: Request): Promise<Response> {
  const cheerio = await import("https://esm.sh/cheerio@1.0.0-rc.12");
  const response = await fetch("https://releases.1password.com/linux/beta/");
  const body = await response.text();
  const $ = cheerio.load(body);
  const heading = $(".c-page-details div").first().text();
  const matchVersion = /^Updated to ([\d.-]+)/.exec(heading);
  if (!matchVersion) {
    throw new Error("Could not extract version from page");
  }
  const version = matchVersion[1];
  // Response.json serialises the payload and labels it as application/json;
  // a bare `new Response(string)` would be served as plain text.
  return Response.json({ version });
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import { fetch } from "https://esm.town/v/std/fetch";
/**
 * HTTP handler that reports a git version listed on the git-core PPA
 * packages page on Launchpad.
 *
 * Reads the text of every `.archive_package_row a` link, picks the first one
 * containing the `query` search parameter (or simply the first one when no
 * query is given) and extracts whatever follows the leading "git - ".
 *
 * @param request Incoming request; its optional `?query=` parameter filters rows.
 * @returns JSON response `{ "query": "...", "version": "..." }`.
 * @throws Error when no row matches, or when the matching row does not have
 *   the expected "git - <version>" shape.
 */
export default async function(request: Request): Promise<Response> {
  const url = new URL(request.url);
  const query = url.searchParams.get("query") ?? "";
  const cheerio = await import("https://esm.sh/cheerio@1.0.0-rc.12");
  const response = await fetch(
    "https://launchpad.net/~git-core/+archive/ubuntu/ppa/+packages",
  );
  const body = await response.text();
  const $ = cheerio.load(body);
  const rowText = $(".archive_package_row a")
    .toArray()
    .map((el) => $(el).text())
    .find((row) => !query || row.includes(query));
  // Without this guard `undefined` would be coerced to the string "undefined"
  // by RegExp.exec and reported as if it were page content.
  if (rowText === undefined) {
    throw new Error(
      query
        ? `Could not extract version from page. No package row matches "${query}"`
        : "Could not extract version from page. No package rows found",
    );
  }
  const matchVersion = /^\s*git\s*-\s*(.+)/.exec(rowText);
  if (!matchVersion) {
    throw new Error(
      `Could not extract version from page. Trying to parse "${rowText}"`,
    );
  }
  const version = matchVersion[1];
  // Response.json serialises the payload and labels it as application/json.
  return Response.json({ query, version });
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import dayjs from "npm:dayjs";
// fetch("https://portal.permit.pcta.org/availability/mexican-border.php").then(resp => console.log(resp.body));
// import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
// import { load } from "npm:cheerio";
// const html = await fetchText(
// "https://portal.permit.pcta.org/availability/mexican-border.php",
// );
// const $ = load(html);
// console.log($.data);
var data = {
limit: 50,
calendar: [
{ start_date: "2024-04-18", num: "50" },
{ start_date: "2024-04-17", num: "50" },
{ start_date: "2024-04-15", num: "50" },
{ start_date: "2024-04-13", num: "50" },
{ start_date: "2024-04-22", num: "50" },
{ start_date: "2024-04-16", num: "50" },
{ start_date: "2024-04-10", num: "50" },
{ start_date: "2024-04-05", num: "50" },
{ start_date: "2024-04-19", num: "50" },
{ start_date: "2024-05-10", num: "50" },
{ start_date: "2024-03-01", num: "48" },
{ start_date: "2024-04-06", num: "50" },
{ start_date: "2024-05-01", num: "50" },
{ start_date: "2024-03-27", num: "49" },
{ start_date: "2024-04-11", num: "50" },
{ start_date: "2024-03-24", num: "46" },
{ start_date: "2024-05-15", num: "50" },
{ start_date: "2024-03-29", num: "50" },
{ start_date: "2024-04-12", num: "50" },
{ start_date: "2024-04-20", num: "50" },
{ start_date: "2024-05-04", num: "50" },
{ start_date: "2024-04-09", num: "50" },
{ start_date: "2024-04-01", num: "50" },
{ start_date: "2024-04-02", num: "50" },
{ start_date: "2024-03-20", num: "47" },
{ start_date: "2024-03-26", num: "48" },
{ start_date: "2024-04-08", num: "50" },
{ start_date: "2024-03-15", num: "49" },
{ start_date: "2024-04-26", num: "50" },
{ start_date: "2024-03-12", num: "48" },
{ start_date: "2024-04-23", num: "50" },
{ start_date: "2024-05-31", num: "50" },
{ start_date: "2024-05-16", num: "50" },
{ start_date: "2024-04-21", num: "50" },
{ start_date: "2024-03-13", num: "46" },
{ start_date: "2024-05-21", num: "50" },
{ start_date: "2024-04-04", num: "50" },
{ start_date: "2024-04-03", num: "50" },
{ start_date: "2024-05-30", num: "50" },
{ start_date: "2024-05-06", num: "50" },
{ start_date: "2024-03-21", num: "49" },
{ start_date: "2024-04-28", num: "50" },
{ start_date: "2024-03-17", num: "46" },
{ start_date: "2024-03-30", num: "50" },
{ start_date: "2024-09-10", num: "1" },
{ start_date: "2024-05-05", num: "50" },
{ start_date: "2024-04-14", num: "50" },
{ start_date: "2024-03-10", num: "47" },
{ start_date: "2024-05-18", num: "50" },
{ start_date: "2024-05-25", num: "50" },
{ start_date: "2024-03-28", num: "48" },
{ start_date: "2024-04-07", num: "50" },
{ start_date: "2024-03-03", num: "45" },
{ start_date: "2024-05-02", num: "50" },
{ start_date: "2024-05-22", num: "50" },
{ start_date: "2024-04-27", num: "50" },
{ start_date: "2024-04-24", num: "50" },
{ start_date: "2024-04-29", num: "50" },
{ start_date: "2024-03-31", num: "50" },
{ start_date: "2024-05-27", num: "50" },
{ start_date: "2024-04-30", num: "50" },
{ start_date: "2024-05-07", num: "50" },
{ start_date: "2024-04-25", num: "50" },
{ start_date: "2024-03-18", num: "49" },
{ start_date: "2024-03-14", num: "49" },
{ start_date: "2024-05-20", num: "50" },
{ start_date: "2024-05-08", num: "50" },
{ start_date: "2024-05-03", num: "50" },
{ start_date: "2024-05-14", num: "50" },
{ start_date: "2024-05-28", num: "50" },
{ start_date: "2024-03-19", num: "49" },
{ start_date: "2024-03-25", num: "46" },
{ start_date: "2024-05-29", num: "50" },
{ start_date: "2024-05-13", num: "50" },
{ start_date: "2024-03-06", num: "47" },
{ start_date: "2024-05-26", num: "50" },
{ start_date: "2024-05-24", num: "50" },
{ start_date: "2024-03-22", num: "49" },
{ start_date: "2024-03-02", num: "48" },
{ start_date: "2024-03-08", num: "49" },
{ start_date: "2024-03-05", num: "47" },
{ start_date: "2024-05-11", num: "50" },
{ start_date: "2024-03-16", num: "48" },
{ start_date: "2024-03-23", num: "48" },
{ start_date: "2024-03-04", num: "48" },
{ start_date: "2024-05-09", num: "50" },
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import cheerio from "npm:cheerio";
import { Feed, FeedOptions, Item as FeedItem } from "npm:feed";
/**
 * HTTP handler that turns the Fly.io blog index into an RSS 2.0 feed.
 *
 * Every `<h1>` on https://fly.io/blog/ becomes one feed item: the heading text
 * is the title, the text of the `<p>` elements under the heading's parent is
 * the description, and the `href` of the first `<a>` under that parent
 * (prefixed with the site origin) is the link.
 *
 * @param req Incoming request (not inspected).
 * @returns The feed as an `application/xml` response; the XML is also logged.
 */
export async function flydotioRSS(req: Request): Promise<Response> {
  const response = await fetch("https://fly.io/blog/");
  const body = await response.text();
  const $ = cheerio.load(body);
  const feed = new Feed({
    title: "Fly.io Blog",
    description: "This is my personal feed!",
    id: "https://fly.io/blog/",
    link: "https://fly.io/blog/",
    image: "https://fly.io/static/images/favicon/favicon-32x32.png",
    favicon: "https://fly.io/static/images/favicon/favicon-32x32.png",
  } as FeedOptions);
  // Query through the loaded `$` rather than the static `cheerio(...)` call,
  // which is deprecated, and iterate plainly since addItem is a side effect.
  for (const el of $("h1").toArray()) {
    const container = el.parentNode;
    feed.addItem({
      title: $(el).text().trim(),
      description: $("p", container).text().trim(),
      link: `https://fly.io${$("a", container).attr("href")}`,
    } as FeedItem);
  }
  // Render once and reuse the result for both the log line and the response.
  const xml = feed.rss2();
  console.log(xml);
  return new Response(xml, { headers: { "Content-Type": "application/xml" } });
}

Fetch the contents of the Wikipedia "On this day in history" page. Defaults to JSON output, but specify ?format=text or ?format=html for other outputs, e.g.

#wikipedia

Readme
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { load } from "npm:cheerio";
/**
 * Downloads Wikipedia's "On this day/Today" page and returns a trimmed slice
 * of the text found under `#mw-content-text`.
 *
 * @returns The text that precedes the first ".mw-parser-output" occurrence,
 *   minus its first and last lines, with surrounding whitespace removed.
 */
const fetchToday = async () => {
  const html = await fetchText(
    "https://en.wikipedia.org/wiki/Wikipedia:On_this_day/Today",
  );
  const contentText = load(html)("#mw-content-text").first().text();
  // TODO trim body a bit; what is all this other stuff?
  // For now, keep only what comes before the first ".mw-parser-output" marker.
  const [beforeMarker] = contentText.split(".mw-parser-output");
  const lines = beforeMarker.split("\n");
  // Drop the first and the last line of that chunk.
  const innerLines = lines.slice(1, -1);
  return innerLines.join("\n").trim();
};
/**
 * HTTP handler that serves the result of `fetchToday()` in the format named
 * by the `?format=` search parameter.
 *
 * Supported values: `json` (responds with `{ data }`), `html` and `text`.
 * When the parameter is absent, `html` is used.
 *
 * @param req Incoming request whose URL may carry `?format=`.
 * @returns The data as JSON, `text/html` or `text/plain` depending on `format`.
 * @throws Error("unsupported format") for any other `format` value.
 */
export const wikipediaToday = async (req: Request) => {
  const searchParams = new URL(req.url).searchParams;
  const format = searchParams.get("format") ?? "html";
  const data = await fetchToday();
  switch (format) {
    case "json":
      return Response.json({ data });
    case "html":
      return new Response(data, { headers: { "Content-Type": "text/html" } });
    case "text":
      // Plain-text requests were previously labelled text/html, which makes
      // clients treat the body as markup.
      return new Response(data, { headers: { "Content-Type": "text/plain; charset=utf-8" } });
    default:
      throw new Error("unsupported format");
  }
};

RSS feed of stevekrouse.com essays

Readme
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import cheerio from "npm:cheerio";
import { Feed } from "npm:feed";
/**
 * Builds an RSS 2.0 feed from the list items on https://stevekrouse.com/.
 *
 * Each `<li>` under `#page > div:nth-child(4) > ul` is expected to look like:
 *   <li><span class="date">2022 Nov 08 - </span><a href="https://...">Title</a></li>
 *
 * The parsed items are logged as JSON before the feed is rendered.
 *
 * @returns An `application/rss+xml` response containing the feed.
 */
export const stevekrouseRSS = async () => {
  const homepage = await fetch("https://stevekrouse.com/");
  const $ = cheerio.load(await homepage.text());

  // One { date, link, title } record per list item.
  const parsedItems = $("#page > div:nth-child(4) > ul > li")
    .toArray()
    .map((el) => {
      const $item = $(el);
      const $anchor = $item.find("a");
      return {
        // "2022 Nov 08 - " -> "2022 Nov 08"
        date: $item.find(".date").text().trim().replace(" -", ""),
        link: $anchor.attr("href"),
        title: $anchor.text(),
      };
    });
  console.log(JSON.stringify(parsedItems));

  const feed = new Feed({
    title: "Steve Krouse's Blog",
    description: "RSS Feed for Steve Krouse's Blog",
    id: "https://stevekrouse.com/",
    link: "https://stevekrouse.com/",
    language: "en",
    // The first item's date doubles as the feed's "updated" timestamp.
    updated: new Date(parsedItems[0]?.date),
    generator: "Cheerio & Feed for TypeScript",
  });

  // The title is reused for the description and content fields.
  for (const { title, link, date } of parsedItems) {
    feed.addItem({
      title,
      id: link,
      link,
      description: title,
      content: title,
      date: new Date(date),
    });
  }

  const rssHeaders = { "Content-Type": "application/rss+xml" };
  return new Response(feed.rss2(), { headers: rssHeaders });
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import process from "node:process";
// fork by https://esm.town/v/webup/getWebLoaderBuilder;
/**
 * Prepares a lazy factory for a LangChain document loader.
 *
 * Nothing loader-specific is imported until the returned function is called;
 * at that point the modules for the selected `type` are imported and the
 * loader is constructed.
 *
 * @param url Page URL, repository URL or audio URL, depending on `type`.
 * @param type Which loader to build: "webpage" (default), "github" or "audio".
 * @param options Passed as the second argument to the GitHub loader, or
 *   spread into the first argument of the audio loader; unused for "webpage".
 * @returns A zero-argument function that yields a promise for the loader.
 */
export async function getWebLoaderBuilder(
  url: string,
  type: "webpage" | "github" | "audio" = "webpage",
  options?: any,
) {
  const { cond, matches } = await import("npm:lodash-es");

  // Cheerio-backed loader for ordinary web pages.
  const buildWebpageLoader = async () => {
    await import("npm:cheerio");
    const { CheerioWebBaseLoader } = await import(
      "npm:langchain/document_loaders/web/cheerio"
    );
    return new CheerioWebBaseLoader(url);
  };

  // Loader for a GitHub repository.
  const buildGithubLoader = async () => {
    await import("npm:ignore");
    const { GithubRepoLoader } = await import(
      "npm:langchain/document_loaders/web/github"
    );
    return new GithubRepoLoader(url, options);
  };

  // AssemblyAI transcript loader; the API key is read from the ASSEMBLYAI env var.
  const buildAudioLoader = async () => {
    const { AudioTranscriptLoader } = await import(
      "npm:langchain/document_loaders/web/assemblyai"
    );
    const loaderParams = { audio_url: url, ...options };
    const clientConfig = { apiKey: process.env.ASSEMBLYAI };
    return new AudioTranscriptLoader(loaderParams, clientConfig);
  };

  // Dispatch on `type`; the first matching predicate decides which builder runs.
  const setup = cond([
    [matches({ type: "webpage" }), buildWebpageLoader],
    [matches({ type: "github" }), buildGithubLoader],
    [matches({ type: "audio" }), buildAudioLoader],
  ]);
  return () => setup({ type });
}