Back to packages list

Vals using @mozilla/readability

Description from the NPM package:
A standalone version of the readability library used for Firefox Reader View.

markdown.download

A handy microservice/library that converts various data sources into markdown. It is intended to make the web easier to consume on e-readers.

https://jsr.io/@tarasglek/markdown-download

Features

  • Apply readability
  • Further convert article into markdown to simplify it
  • Allow webpages to be viewable as markdown via curl
  • Serve markdown converted to html to browsers
  • Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";
import { DOMParser } from "npm:linkedom@0.16.10";
import { marked } from "npm:marked@12.0.1";
import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";
// Runtime probe: true when both the global Request and Response constructors exist.
// NOTE(review): these globals presumably also exist in Deno and recent Node, so this check
// may not single out Cloudflare Workers — confirm.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";
// init async loading of modules
// Only one converter is loaded: agentmarkdown when the probe is true (kept as a pending,
// un-awaited import), turndown otherwise (awaited here, so the binding holds the loaded module).
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");
/**
 * Converts an HTML string with whichever converter module was selected at load time.
 * Note the name: the argument is HTML and it is handed to AgentMarkdown.produce /
 * Turndown's turndown().
 * @param html HTML source to convert
 * @returns the converter's string output
 */
async function markdown2html(html: string): Promise<string> {
  if (!AgentMarkdownImport) {
    // No AgentMarkdown import was started, so fall back to the Turndown module
    const Turndown = await TurndownService;
    return new Turndown().turndown(html);
  }
  // TurndownService doesn't work on cf, so the AgentMarkdown import is awaited instead
  const { AgentMarkdown } = await AgentMarkdownImport;
  return await AgentMarkdown.produce(html);
}
/**
 * Pulls the 11-character video ID out of a YouTube link.
 * @param url URL to inspect; the pattern is applied to its full href
 * @returns the captured ID, or null when the href does not match
 */
function getYoutubeVideoID(url: URL): string | null {
  const youtubePattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = youtubePattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}
/**
 * Builds a 200 response carrying permissive CORS headers.
 * @param message response body
 * @param contentType value for the Content-Type header (defaults to "text/markdown")
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}
/**
 * Builds a JSON error response of the form `{ error: { message, code } }`.
 *
 * The HTTP status now matches the `code` reported in the body (400); previously the
 * response was sent with status 200, so clients checking `res.ok` saw a success.
 * @param msg human-readable error description
 */
function err(msg: string): Response {
  const status = 400;
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: status,
    },
  });
  // Reuse response() for its CORS / content-type headers, then re-wrap with the real status.
  const base = response(errorMessage, "application/json");
  return new Response(errorMessage, { status, headers: base.headers });
}
/**
 * Parses a user-supplied address leniently: the text is tried as-is first and, if that
 * fails to parse, retried with "https://" prepended. A failure of the second attempt
 * propagates to the caller.
 */
function fudgeURL(url: string) {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not parseable on its own; retry with an https scheme in front
    parsed = new URL("https://" + url);
  }
  return parsed;
}
/**
 * Works out which URL the request is asking to convert.
 *
 * The target is the request path plus query string when that text starts with "http";
 * otherwise the `url` query parameter is used if present. With no `url` parameter and a
 * path shorter than two characters, the HTML form UI is prepared instead. Any other path
 * is passed to fudgeURL as-is.
 *
 * @returns an object with either `url` (the target) or `response` (the UI page) set
 */
function processInput(req: Request) {
  const outcome: { url: undefined | URL; response: undefined | Response } = {
    url: undefined,
    response: undefined,
  };
  const requested = new URL(req.url);
  // Drop the leading "/" and keep the query string, since it may belong to the target URL
  let target = requested.pathname.substring(1) + requested.search;
  const looksAbsolute = target.startsWith("http");
  if (!looksAbsolute) {
    const fromQuery = requested.searchParams.get("url");
    if (fromQuery) {
      target = fromQuery;
    } else if (target.length < 2) {
      const page = generate_ui(
        "URL to convert to markdown:",
        "https://www.val.town/v/taras/markdown_download",
        "markdown.download",
      );
      outcome.response = response(page, "text/html");
      return outcome;
    }
  }
  outcome.url = fudgeURL(target);
  return outcome;
}
export default async function(req: Request): Promise<Response> {
const action = processInput(req);
const url = action.url;
if (!url) {
return action.response!;
}
const youtubeVideoID = getYoutubeVideoID(url);
if (youtubeVideoID) {
const arr = (await getSubtitles({
videoID: youtubeVideoID,
})) as { text: string }[];
const description = "## Generated Transcription\n\n"

markdown.download

A handy microservice/library that converts various data sources into markdown. It is intended to make the web easier to consume on e-readers.

https://jsr.io/@tarasglek/markdown-download

Features

  • Apply readability
  • Further convert article into markdown to simplify it
  • Allow webpages to be viewable as markdown via curl
  • Serve markdown converted to html to browsers
  • Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";
import { DOMParser } from "npm:linkedom@0.16.10";
import { marked } from "npm:marked@12.0.1";
import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";
// Runtime probe: true when both the global Request and Response constructors exist.
// NOTE(review): these globals presumably also exist in Deno and recent Node, so this check
// may not single out Cloudflare Workers — confirm.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";
// init async loading of modules
// Only one converter is loaded: agentmarkdown when the probe is true (kept as a pending,
// un-awaited import), turndown otherwise (awaited here, so the binding holds the loaded module).
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");
/**
 * Converts an HTML string with whichever converter module was selected at load time.
 * Note the name: the argument is HTML and it is handed to AgentMarkdown.produce /
 * Turndown's turndown().
 * @param html HTML source to convert
 * @returns the converter's string output
 */
async function markdown2html(html: string): Promise<string> {
  if (!AgentMarkdownImport) {
    // No AgentMarkdown import was started, so fall back to the Turndown module
    const Turndown = await TurndownService;
    return new Turndown().turndown(html);
  }
  // TurndownService doesn't work on cf, so the AgentMarkdown import is awaited instead
  const { AgentMarkdown } = await AgentMarkdownImport;
  return await AgentMarkdown.produce(html);
}
/**
 * Pulls the 11-character video ID out of a YouTube link.
 * @param url URL to inspect; the pattern is applied to its full href
 * @returns the captured ID, or null when the href does not match
 */
function getYoutubeVideoID(url: URL): string | null {
  const youtubePattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = youtubePattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}
/**
 * Builds a 200 response carrying permissive CORS headers.
 * @param message response body
 * @param contentType value for the Content-Type header (defaults to "text/markdown")
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}
/**
 * Builds a JSON error response of the form `{ error: { message, code } }`.
 *
 * The HTTP status now matches the `code` reported in the body (400); previously the
 * response was sent with status 200, so clients checking `res.ok` saw a success.
 * @param msg human-readable error description
 */
function err(msg: string): Response {
  const status = 400;
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: status,
    },
  });
  // Reuse response() for its CORS / content-type headers, then re-wrap with the real status.
  const base = response(errorMessage, "application/json");
  return new Response(errorMessage, { status, headers: base.headers });
}
/**
 * Parses a user-supplied address leniently: the text is tried as-is first and, if that
 * fails to parse, retried with "https://" prepended. A failure of the second attempt
 * propagates to the caller.
 */
function fudgeURL(url: string) {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not parseable on its own; retry with an https scheme in front
    parsed = new URL("https://" + url);
  }
  return parsed;
}
/**
 * Works out which URL the request is asking to convert.
 *
 * The target is the request path plus query string when that text starts with "http";
 * otherwise the `url` query parameter is used if present. With no `url` parameter and a
 * path shorter than two characters, the HTML form UI is prepared instead. Any other path
 * is passed to fudgeURL as-is.
 *
 * @returns an object with either `url` (the target) or `response` (the UI page) set
 */
function processInput(req: Request) {
  const outcome: { url: undefined | URL; response: undefined | Response } = {
    url: undefined,
    response: undefined,
  };
  const requested = new URL(req.url);
  // Drop the leading "/" and keep the query string, since it may belong to the target URL
  let target = requested.pathname.substring(1) + requested.search;
  const looksAbsolute = target.startsWith("http");
  if (!looksAbsolute) {
    const fromQuery = requested.searchParams.get("url");
    if (fromQuery) {
      target = fromQuery;
    } else if (target.length < 2) {
      const page = generate_ui(
        "URL to convert to markdown:",
        "https://www.val.town/v/taras/markdown_download",
        "markdown.download",
      );
      outcome.response = response(page, "text/html");
      return outcome;
    }
  }
  outcome.url = fudgeURL(target);
  return outcome;
}
export default async function(req: Request): Promise<Response> {
const action = processInput(req);
const url = action.url;
if (!url) {
return action.response!;
}
const youtubeVideoID = getYoutubeVideoID(url);
if (youtubeVideoID) {
const arr = (await getSubtitles({
videoID: youtubeVideoID,
})) as { text: string }[];
const description = "## Generated Transcription\n\n"

markdown.download

A handy microservice/library that converts various data sources into markdown. It is intended to make the web easier to consume on e-readers.

Introductory blog post: https://taras.glek.net/post/markdown.download/

Package: https://jsr.io/@tarasglek/markdown-download

Features

  • Apply readability
  • Further convert article into markdown to simplify it
  • Allow webpages to be viewable as markdown via curl
  • Serve markdown converted to html to browsers
  • Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";
import { DOMParser } from "npm:linkedom@0.16.10";
import { marked } from "npm:marked@12.0.1";
import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";
import { YouTube } from "npm:youtube-sr@4.3.11";
// Runtime probe: true when both the global Request and Response constructors exist.
// NOTE(review): these globals presumably also exist in Deno and recent Node, so this check
// may not single out Cloudflare Workers — confirm.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";
// init async loading of modules
// Only one converter is loaded: agentmarkdown when the probe is true (kept as a pending,
// un-awaited import), turndown otherwise (awaited here, so the binding holds the loaded module).
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");
/**
 * Converts HTML to markdown using whichever converter module was selected at load time.
 * @param html HTML source to convert
 * @returns markdown as a string
 */
export async function html2markdown(html: string): Promise<string> {
  if (!AgentMarkdownImport) {
    // No AgentMarkdown import was started, so fall back to the Turndown module
    const Turndown = await TurndownService;
    return new Turndown().turndown(html);
  }
  // TurndownService doesn't work on cf, so the AgentMarkdown import is awaited instead
  const { AgentMarkdown } = await AgentMarkdownImport;
  return await AgentMarkdown.produce(html);
}
/**
 * Runs Readability over an HTML document and converts the extracted article to markdown.
 * When Readability yields no article content, an empty string is converted instead.
 * @param html full HTML source of the page
 * @returns the document title together with the article markdown
 */
export async function readability2markdown(html: string): Promise<{ title: string; markdown: string }> {
  const parser = new DOMParser();
  const doc = await parser.parseFromString(html, "text/html");
  const article = new Readability(doc).parse();
  const extractedHtml = article?.content || "";
  const markdown = await html2markdown(extractedHtml);
  // NOTE(review): relies on the parsed document's `title` exposing `textContent` — confirm
  // this holds for the linkedom version in use.
  return { title: doc.title.textContent, markdown };
}
/**
 * Pulls the 11-character video ID out of a YouTube link.
 * @param url URL to inspect; the pattern is applied to its full href
 * @returns the captured ID, or null when the href does not match
 */
function getYoutubeVideoID(url: URL): string | null {
  const youtubePattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = youtubePattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}
/**
 * Builds a 200 response carrying permissive CORS headers.
 * @param message response body
 * @param contentType value for the Content-Type header (defaults to "text/markdown")
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}
/**
 * Builds a JSON error response of the form `{ error: { message, code } }`.
 *
 * The HTTP status now matches the `code` reported in the body (400); previously the
 * response was sent with status 200, so clients checking `res.ok` saw a success.
 * @param msg human-readable error description
 */
function err(msg: string): Response {
  const status = 400;
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: status,
    },
  });
  // Reuse response() for its CORS / content-type headers, then re-wrap with the real status.
  const base = response(errorMessage, "application/json");
  return new Response(errorMessage, { status, headers: base.headers });
}
/**
 * Parses a user-supplied address leniently: the text is tried as-is first and, if that
 * fails to parse, retried with "https://" prepended. A failure of the second attempt
 * propagates to the caller.
 */
function fudgeURL(url: string) {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Not parseable on its own; retry with an https scheme in front
    parsed = new URL("https://" + url);
  }
  return parsed;
}
function processInput(req: Request) {
let ret = {
url: undefined as undefined | URL,
response: undefined as undefined | Response,
};
const myurl = new URL(req.url);
let pathname = myurl.pathname.substring(1) + myurl.search;
if (!pathname.startsWith("http")) {
const urlAsFormParam = myurl.searchParams.get("url");
if (urlAsFormParam) {
pathname = urlAsFormParam;
} else if (pathname.length < 2) {
ret.response = response(
generate_ui(
"URL to convert to markdown:",
"https://www.val.town/v/taras/markdown_download",
"markdown.download",
),
"text/html",
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { resetStyle } from "https://esm.town/v/nbbaier/resetStyle";
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { html } from "https://esm.town/v/stevekrouse/html?v=5";
import { Readability } from "npm:@mozilla/readability";
// @ts-expect-error
import jsdom from "npm:jsdom";
/**
 * Shape of a parsed article.
 * NOTE(review): presumably mirrors the object returned by Readability's `parse()`;
 * confirm field names and nullability against the library's typings.
 */
type Article = {
title: string;
content: string;
textContent: string;
length: number;
excerpt: string;
byline: string;
dir: string;
siteName: string;
lang: string;
publishedTime: string;
};
/**
 * Wraps page content in the full HTML document used by this val (fonts, reset styles,
 * basic layout rules).
 *
 * @param title text for the document's <title>. It is HTML-escaped here because callers
 *   pass titles taken from fetched pages, which may contain markup-significant characters.
 * @param pageContent HTML inserted verbatim inside <body>
 * @returns the complete HTML document as a string
 */
const pageShell = (title: string, pageContent: string) => {
  const htmlEntities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  // String() keeps the previous behaviour of stringifying a non-string title rather than throwing.
  const safeTitle = String(title).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);
  return (`<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Source+Sans+3:ital,wght@0,200..900;1,200..900&display=swap" rel="stylesheet">
<title>${safeTitle}</title>
<style>
${resetStyle}
html {
font-family: "Source Sans 3", system-ui, sans-serif;
padding: 0.75rem;
margin: auto;
max-width: 90ch;
}
.title-container {
display: flex;
flex-direction: column;
margin-block-end: 0.25rem
}
.subhead {
margin-block-start: 0.5rem;
color: gray;
}
</style>
</head>
<body>
${pageContent}
</body>
</html>
`);
};
export default async function(req: Request): Promise<Response> {
const JSDOM = jsdom.JSDOM;
const url = new URL(req.url);
const articleUrl = url.pathname.substring(1);
if (articleUrl === "") {
return html(pageShell(
"VT Reader",
`
<h1>Val Town Reader</h1>
<p>Enter a url below to get a reader view using <a href="https://github.com/mozilla/readability">Readability.js</a></p>
<form style="display: flex; gap: .25rem; margin-block-start:1rem;" id="nameForm">
<input type="text" id="name" name="name" placeholder="Enter a url" required>
<br>
<input type="submit" value="Submit">
</form>
<script>
document.getElementById('nameForm').onsubmit = function(event) {
event.preventDefault();
const targetURL = document.getElementById("name").value
const newURL = "${url.origin}" + "/" + targetURL
window.location = newURL
};
</script>
</body> `,
));
}
let body = await fetchText(articleUrl);
let doc = new JSDOM(body);
let reader = new Readability(doc.window.document);
let article = reader.parse();
let title = article.title;
return html(pageShell(
title,

Readable

Make any website instantly readable!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import { Readability } from "npm:@mozilla/readability"; // @ts-ignore
import { JSDOM } from "npm:jsdom";
/**
 * HTTP handler: treats the request path as a host/path, fetches it over https, runs
 * Readability on the result and returns the extracted article wrapped in a Pico-styled page.
 *
 * Readability's `parse()` can return null when no article can be extracted; that case now
 * yields a 422 plain-text response instead of throwing on `article.content`.
 */
export default async function(req: Request) {
  // pathname starts with "/", so this forms "https:///host/..."; the original request is
  // forwarded as the fetch init.
  let resp = await fetch(`https://${new URL(req.url).pathname}`, req);
  let body = await resp.text();
  let doc = new JSDOM(body);
  // Readability runs on a clone of the document, leaving the parsed original untouched
  let reader = new Readability(doc.window.document.cloneNode(true));
  let article = reader.parse();
  if (!article) {
    return new Response("Could not extract readable content from this page", {
      status: 422,
      headers: {
        "Content-Type": "text/plain",
      },
    });
  }
  console.log(article.content);
  return new Response(
    `
<html>
<head>
<link
rel="stylesheet"
href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"
/>
</head>
<body>
<main>
${article.content}
</main>
</body>
</html>
`,
    {
      headers: {
        "Content-Type": "text/html",
      },
    },
  );
}

This is a deno/valtown port in progress of https://github.com/tarasglek/scrape2md

License: MIT

Handy script to scrape various data sources into markdown. Intended to feed llms in https://chatcraft.org

Usage: https://taras-scrape2md.web.val.run/ + URL_TO_SCRAPE

Or just visit in browser and paste your url

TODO

  • https://chatcraft.org/api/share/tarasglek/IDYChVAilfePgVZb_T5pH
  • POST from browser
  • Sync to GitHub via https://www.val.town/v/nbbaier/valToGH

Metadata for use with https://github.com/tarasglek/valtown2js:

{
  "typeCheck": false,
  "mappings": {
    "https://esm.sh/linkedom": {
      "name": "linkedom",
      "version": "^0.16.8"
    }
  },
  "package": {
    "name": "scrape2md",
    "version": "1.0.0",
    "devDependencies": {
      "@types/turndown": "^5.0.4"
    }
  }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";
import { DOMParser } from "npm:linkedom@0.16.10";
import { marked } from "npm:marked@12.0.1";
import TurndownService from "npm:turndown@^7.1.3";
import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";
/**
 * Pulls the 11-character video ID out of a YouTube link.
 * @param url URL to inspect; the pattern is applied to its full href
 * @returns the captured ID, or null when the href does not match
 */
function getYoutubeVideoID(url: URL): string | null {
  const youtubePattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = youtubePattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}
/**
 * Builds a 200 response carrying permissive CORS headers.
 * @param message response body
 * @param contentType value for the Content-Type header (defaults to "text/markdown")
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}
/**
 * Builds a JSON error response of the form `{ error: { message, code } }`.
 *
 * The HTTP status now matches the `code` reported in the body (400); previously the
 * response was sent with status 200, so clients checking `res.ok` saw a success.
 * @param msg human-readable error description
 */
function err(msg: string): Response {
  const status = 400;
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: status,
    },
  });
  // Reuse response() for its CORS / content-type headers, then re-wrap with the real status.
  const base = response(errorMessage, "application/json");
  return new Response(errorMessage, { status, headers: base.headers });
}
/**
 * HTTP entry point.
 *
 * - The target URL is the request path (plus query string) when it starts with "http",
 *   otherwise the `url` query parameter; with neither, the HTML form UI is served.
 * - YouTube links are answered with the video's subtitles as markdown.
 * - Any other page is fetched with browser-like headers, reduced with Readability and
 *   converted to markdown. Clients whose Accept header includes text/html get that
 *   markdown rendered back to HTML.
 *
 * An unparseable target URL now produces the JSON error from err() instead of an
 * uncaught TypeError from the URL constructor.
 */
export default async function(req: Request): Promise<Response> {
  const myurl = new URL(req.url);
  let pathname = myurl.pathname.substring(1) + myurl.search;
  if (!pathname.startsWith("http")) {
    const urlAsFormParam = myurl.searchParams.get("url");
    if (urlAsFormParam) {
      pathname = urlAsFormParam;
    } else {
      return response(html, "text/html");
    }
  }
  let url: URL;
  try {
    url = new URL(pathname);
  } catch {
    return err(`Invalid URL: ${pathname}`);
  }
  const youtubeVideoID = getYoutubeVideoID(url);
  if (youtubeVideoID) {
    const arr = (await getSubtitles({
      videoID: youtubeVideoID,
    })) as { text: string }[];
    const description = "## Generated Transcription\n\n"
      + arr.map(({ text }) => text).join("\n");
    return response(description);
  }
  // Request headers imitate a desktop Chrome navigation.
  const upstream = await fetch(url.toString(), {
    method: req.method,
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
      // Add any other headers you need here
    }),
  });
  const pageHtml = await upstream.text();
  const doc = new DOMParser().parseFromString(pageHtml, "text/html");
  const reader = new Readability(doc);
  const article = reader.parse();
  console.log("content", typeof article?.content, article?.content);
  // The source URL is appended below the converted article
  const markdown = new TurndownService().turndown(article?.content || "") + "\n\n" + url;
  if (req.headers.get("Accept")?.includes("text/html")) {
    return response(await marked.parse(markdown), "text/html");
  } else {
    return response(markdown);
  }
}
const html = `
<!DOCTYPE html>
<html>
<head>
<title>scrape2md ui</title>
<!-- Tailwind CSS -->
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import { DOMParser } from "https://deno.land/x/deno_dom/deno-dom-wasm.ts";
import { Readability } from "npm:@mozilla/readability";
// inspired by https://www.val.town/v/stevekrouse/fetchJSON
/**
 * Result returned by fetchReadable: the object produced by Readability's `parse()`
 * with a `url` field added.
 * NOTE(review): the field list presumably mirrors Readability's output — confirm names
 * and nullability against the library's typings.
 */
interface Readable {
title: string;
content: string;
textContent: string;
length: number;
excerpt: string;
byline: string;
dir: string;
siteName: string;
lang: string;
publishedTime: string;
// url is final url after redirects
url: string;
}
/**
 * Fetches a page and returns its Readability-extracted article, with the final
 * (post-redirect) URL attached as `url`.
 *
 * @param url address of the page to fetch
 * @returns the parsed article plus `url`
 * @throws Error prefixed with "fetchReadable error:" when the body cannot be read or
 *   parsed, or when Readability cannot extract an article (previously this last case
 *   surfaced as an opaque "cannot set properties of null" message)
 */
export const fetchReadable = async (url: string): Promise<Readable> => {
  const r = await fetch(url, { headers: { "content-type": "text/html" }, redirect: "follow" });
  try {
    const text = await r.text();
    const node = new DOMParser().parseFromString(text, "text/html");
    if (!node) {
      throw new Error("response body could not be parsed as HTML");
    }
    // Add a <base> pointing at the requested URL — presumably so relative links in the
    // article resolve against it.
    const relative = node.createElement("base");
    relative.href = url;
    node.head.appendChild(relative);
    const data = new Readability(node).parse();
    if (!data) {
      throw new Error("no readable article could be extracted");
    }
    data.url = r.url;
    return data;
  } catch (err) {
    // A catch variable may be any thrown value, so narrow before reading .message
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`fetchReadable error: ${reason} in ${url}`);
  }
};
1
2
3
4
5
6
7
8
9
/**
 * Parses `params.html` as an HTML document and returns Readability's `parse()` result
 * for it. Both libraries are imported lazily on each call.
 */
export const simplifyHTML = async (params) => {
  const domModule = await import(
    "https://deno.land/x/deno_dom/deno-dom-wasm.ts"
  );
  const readabilityModule = await import("npm:@mozilla/readability");
  const parser = new domModule.DOMParser();
  const parsedDocument = parser.parseFromString(params.html, "text/html");
  return new readabilityModule.Readability(parsedDocument).parse();
};
1
Next