Vals using @mozilla/readability

Description from the NPM package:

A standalone version of the readability library used for Firefox Reader View.

markdown.download

Handy microservice/library to convert various data sources into markdown. Intended to make it easier to consume the web on e-readers.

https://jsr.io/@tarasglek/markdown-download

Features

Apply readability
Further convert article into markdown to simplify it
Allow webpages to be viewable as markdown via curl
Serve markdown converted to html to browsers
Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

markdown_download

100

import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";

import { DOMParser } from "npm:linkedom@0.16.10";

import { marked } from "npm:marked@12.0.1";

import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";

// Runtime feature check used to choose the HTML-to-markdown backend loaded below.
// NOTE(review): `Request` and `Response` are also globals in Deno, browsers and recent
// Node.js, so this is likely true outside Cloudflare Workers too — confirm whether a
// Workers-specific signal is intended.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";

// init async loading of modules

// Un-awaited import promise when the check above is true, otherwise null.
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;

// Module namespace of turndown (resolved via top-level await) when the check is false, otherwise null.
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");

/**
 * Converts an HTML string to markdown.
 *
 * Despite the name, the direction is HTML -> markdown. The converter is whichever
 * module the top-level `AgentMarkdownImport` / `TurndownService` constants loaded.
 *
 * @param html HTML source to convert.
 * @returns The markdown text produced by the selected converter.
 * @throws Error if neither converter module was loaded.
 */
async function markdown2html(html: string): Promise<string> {
  if (AgentMarkdownImport) {
    // TurndownService doesn't work on cf
    const { AgentMarkdown } = await AgentMarkdownImport;
    return await AgentMarkdown.produce(html);
  }

  const turndownModule = await TurndownService;
  if (!turndownModule) {
    throw new Error("No HTML-to-markdown converter module was loaded");
  }
  // `import()` yields a module namespace object, which cannot be used with `new`;
  // the constructor is the module's default export.
  const Turndown = turndownModule.default;
  return new Turndown().turndown(html);
}

/**
 * Pulls the 11-character video id out of a YouTube-style URL.
 *
 * The pattern is applied to `url.href` and recognises `youtu.be/<id>`,
 * `youtube.com/v/<id>`, `/e/<id>`, `/embed/<id>`, nested paths ending in `/<id>`,
 * and any `youtube.com` URL carrying `v=<id>` in its query string.
 *
 * @param url URL to inspect.
 * @returns The captured id, or `null` when the href does not match.
 */
function getYoutubeVideoID(url: URL): string | null {
  const idPattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = idPattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}

/**
 * Wraps a body in an HTTP 200 response with permissive CORS headers.
 *
 * @param message Response body.
 * @param contentType Value for the `Content-Type` header; defaults to `text/markdown`.
 * @returns A `Response` with status 200, the CORS headers below and the given content type.
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}

/**
 * Builds a JSON error payload of the form `{"error":{"message":<msg>,"code":400}}`.
 *
 * The payload is sent through `response()`, so the HTTP status line is still 200;
 * only the JSON body carries the 400 code.
 *
 * @param msg Human-readable error description.
 * @returns A `Response` with content type `application/json`.
 */
function err(msg: string): Response {
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: 400,
    },
  });
  return response(errorMessage, "application/json");
}

/**
 * Parses user-supplied text into a `URL`, retrying with an `https://` prefix
 * when the text does not parse on its own (e.g. `example.com/page`).
 *
 * @param url Text to parse.
 * @returns The parsed URL.
 * @throws TypeError when the text is not a valid URL even with the prefix added.
 */
function fudgeURL(url: string): URL {
  try {
    return new URL(url);
  } catch {
    // First parse failed (typically no scheme); assume https.
    return new URL("https://" + url);
  }
}

/**
 * Decides what an incoming request is asking for.
 *
 * The target is taken from the request path plus query string (everything after
 * the leading "/"). When that does not start with "http", the `url` query
 * parameter is used instead; when there is no such parameter and the path is
 * shorter than two characters, the landing page is returned.
 *
 * @param req Incoming request.
 * @returns An object with either `url` (target to convert) or `response`
 *   (ready-made landing page) set; the other field stays `undefined`.
 * @throws TypeError propagated from `fudgeURL` when the target cannot be parsed.
 */
function processInput(req: Request) {
  const ret = {
    url: undefined as undefined | URL,
    response: undefined as undefined | Response,
  };
  const myurl = new URL(req.url);
  let pathname = myurl.pathname.substring(1) + myurl.search;
  if (!pathname.startsWith("http")) {
    const urlAsFormParam = myurl.searchParams.get("url");
    if (urlAsFormParam) {
      pathname = urlAsFormParam;
    } else if (pathname.length < 2) {
      // `generate_ui` is defined outside this view; presumably it renders the input form.
      ret.response = response(
        generate_ui(
          "URL to convert to markdown:",
          "https://www.val.town/v/taras/markdown_download",
          "markdown.download",
        ),
        "text/html",
      );
      return ret;
    }
  }
  ret.url = fudgeURL(pathname);
  return ret;
}

export default async function(req: Request): Promise<Response> {

const action = processInput(req);

const url = action.url;

if (!url) {

return action.response!;

}

const youtubeVideoID = getYoutubeVideoID(url);

if (youtubeVideoID) {

const arr = (await getSubtitles({

videoID: youtubeVideoID,

})) as { text: string }[];

const description = "## Generated Transcription\n\n"

markdown.download

Handy microservice/library to convert various data sources into markdown. Intended to make it easier to consume the web on e-readers.

https://jsr.io/@tarasglek/markdown-download

Features

Apply readability
Further convert article into markdown to simplify it
Allow webpages to be viewable as markdown via curl
Serve markdown converted to html to browsers
Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

yellowHawk

100

import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";

import { DOMParser } from "npm:linkedom@0.16.10";

import { marked } from "npm:marked@12.0.1";

import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";

// Runtime feature check used to choose the HTML-to-markdown backend loaded below.
// NOTE(review): `Request` and `Response` are also globals in Deno, browsers and recent
// Node.js, so this is likely true outside Cloudflare Workers too — confirm whether a
// Workers-specific signal is intended.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";

// init async loading of modules

// Un-awaited import promise when the check above is true, otherwise null.
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;

// Module namespace of turndown (resolved via top-level await) when the check is false, otherwise null.
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");

/**
 * Converts an HTML string to markdown.
 *
 * Despite the name, the direction is HTML -> markdown. The converter is whichever
 * module the top-level `AgentMarkdownImport` / `TurndownService` constants loaded.
 *
 * @param html HTML source to convert.
 * @returns The markdown text produced by the selected converter.
 * @throws Error if neither converter module was loaded.
 */
async function markdown2html(html: string): Promise<string> {
  if (AgentMarkdownImport) {
    // TurndownService doesn't work on cf
    const { AgentMarkdown } = await AgentMarkdownImport;
    return await AgentMarkdown.produce(html);
  }

  const turndownModule = await TurndownService;
  if (!turndownModule) {
    throw new Error("No HTML-to-markdown converter module was loaded");
  }
  // `import()` yields a module namespace object, which cannot be used with `new`;
  // the constructor is the module's default export.
  const Turndown = turndownModule.default;
  return new Turndown().turndown(html);
}

/**
 * Pulls the 11-character video id out of a YouTube-style URL.
 *
 * The pattern is applied to `url.href` and recognises `youtu.be/<id>`,
 * `youtube.com/v/<id>`, `/e/<id>`, `/embed/<id>`, nested paths ending in `/<id>`,
 * and any `youtube.com` URL carrying `v=<id>` in its query string.
 *
 * @param url URL to inspect.
 * @returns The captured id, or `null` when the href does not match.
 */
function getYoutubeVideoID(url: URL): string | null {
  const idPattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = idPattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}

/**
 * Wraps a body in an HTTP 200 response with permissive CORS headers.
 *
 * @param message Response body.
 * @param contentType Value for the `Content-Type` header; defaults to `text/markdown`.
 * @returns A `Response` with status 200, the CORS headers below and the given content type.
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}

/**
 * Builds a JSON error payload of the form `{"error":{"message":<msg>,"code":400}}`.
 *
 * The payload is sent through `response()`, so the HTTP status line is still 200;
 * only the JSON body carries the 400 code.
 *
 * @param msg Human-readable error description.
 * @returns A `Response` with content type `application/json`.
 */
function err(msg: string): Response {
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: 400,
    },
  });
  return response(errorMessage, "application/json");
}

/**
 * Parses user-supplied text into a `URL`, retrying with an `https://` prefix
 * when the text does not parse on its own (e.g. `example.com/page`).
 *
 * @param url Text to parse.
 * @returns The parsed URL.
 * @throws TypeError when the text is not a valid URL even with the prefix added.
 */
function fudgeURL(url: string): URL {
  try {
    return new URL(url);
  } catch {
    // First parse failed (typically no scheme); assume https.
    return new URL("https://" + url);
  }
}

/**
 * Decides what an incoming request is asking for.
 *
 * The target is taken from the request path plus query string (everything after
 * the leading "/"). When that does not start with "http", the `url` query
 * parameter is used instead; when there is no such parameter and the path is
 * shorter than two characters, the landing page is returned.
 *
 * @param req Incoming request.
 * @returns An object with either `url` (target to convert) or `response`
 *   (ready-made landing page) set; the other field stays `undefined`.
 * @throws TypeError propagated from `fudgeURL` when the target cannot be parsed.
 */
function processInput(req: Request) {
  const ret = {
    url: undefined as undefined | URL,
    response: undefined as undefined | Response,
  };
  const myurl = new URL(req.url);
  let pathname = myurl.pathname.substring(1) + myurl.search;
  if (!pathname.startsWith("http")) {
    const urlAsFormParam = myurl.searchParams.get("url");
    if (urlAsFormParam) {
      pathname = urlAsFormParam;
    } else if (pathname.length < 2) {
      // `generate_ui` is defined outside this view; presumably it renders the input form.
      ret.response = response(
        generate_ui(
          "URL to convert to markdown:",
          "https://www.val.town/v/taras/markdown_download",
          "markdown.download",
        ),
        "text/html",
      );
      return ret;
    }
  }
  ret.url = fudgeURL(pathname);
  return ret;
}

export default async function(req: Request): Promise<Response> {

const action = processInput(req);

const url = action.url;

if (!url) {

return action.response!;

}

const youtubeVideoID = getYoutubeVideoID(url);

if (youtubeVideoID) {

const arr = (await getSubtitles({

videoID: youtubeVideoID,

})) as { text: string }[];

const description = "## Generated Transcription\n\n"

markdown.download

Handy microservice/library to convert various data sources into markdown. Intended to make it easier to consume the web on e-readers.

Introductory blog post: https://taras.glek.net/post/markdown.download/

Package: https://jsr.io/@tarasglek/markdown-download

Features

Apply readability
Further convert article into markdown to simplify it
Allow webpages to be viewable as markdown via curl
Serve markdown converted to html to browsers
Extract youtube subtitles

Source

https://github.com/tarasglek/markdown-download

https://www.val.town/v/taras/markdown_download

License: MIT

Usage: https://markdown.download/ + URL

Dev: https://val.markdown.download/ + URL

markdown_download

100

import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";

import { DOMParser } from "npm:linkedom@0.16.10";

import { marked } from "npm:marked@12.0.1";

import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";

import { YouTube } from "npm:youtube-sr@4.3.11";

// Runtime feature check used to choose the HTML-to-markdown backend loaded below.
// NOTE(review): `Request` and `Response` are also globals in Deno, browsers and recent
// Node.js, so this is likely true outside Cloudflare Workers too — confirm whether a
// Workers-specific signal is intended.
const isCloudflareWorker = typeof Request !== "undefined" && typeof Response !== "undefined";

// init async loading of modules

// Un-awaited import promise when the check above is true, otherwise null.
const AgentMarkdownImport = isCloudflareWorker ? import("npm:agentmarkdown@6.0.0") : null;

// Module namespace of turndown (resolved via top-level await) when the check is false, otherwise null.
const TurndownService = isCloudflareWorker ? null : await import("npm:turndown@^7.1.3");

/**
 * Converts HTML to markdown.
 *
 * The converter is whichever module the top-level `AgentMarkdownImport` /
 * `TurndownService` constants loaded.
 *
 * @param html HTML source to convert.
 * @returns markdown in string
 * @throws Error if neither converter module was loaded.
 */
export async function html2markdown(html: string): Promise<string> {
  if (AgentMarkdownImport) {
    // TurndownService doesn't work on cf
    const { AgentMarkdown } = await AgentMarkdownImport;
    return await AgentMarkdown.produce(html);
  }

  const turndownModule = await TurndownService;
  if (!turndownModule) {
    throw new Error("No HTML-to-markdown converter module was loaded");
  }
  // `import()` yields a module namespace object, which cannot be used with `new`;
  // the constructor is the module's default export.
  const Turndown = turndownModule.default;
  return new Turndown().turndown(html);
}

/**
 * Extracts the article from an HTML document with Readability,
 * then converts the extracted content to markdown.
 *
 * @param html Full HTML source of the page.
 * @returns The document title and the article as markdown; the markdown is the
 *   conversion of an empty string when Readability yields no content.
 */
export async function readability2markdown(html: string): Promise<{ title: string; markdown: string }> {
  const doc = await (new DOMParser().parseFromString(html, "text/html"));
  const reader = new Readability(doc);
  const article = reader.parse();
  const markdown = await html2markdown(article?.content || "");
  // NOTE(review): relies on `doc.title` exposing `textContent` in the DOM
  // implementation in use; in a standard DOM `document.title` is a plain string — confirm.
  return { title: doc.title.textContent, markdown };
}

/**
 * Pulls the 11-character video id out of a YouTube-style URL.
 *
 * The pattern is applied to `url.href` and recognises `youtu.be/<id>`,
 * `youtube.com/v/<id>`, `/e/<id>`, `/embed/<id>`, nested paths ending in `/<id>`,
 * and any `youtube.com` URL carrying `v=<id>` in its query string.
 *
 * @param url URL to inspect.
 * @returns The captured id, or `null` when the href does not match.
 */
function getYoutubeVideoID(url: URL): string | null {
  const idPattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = idPattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}

/**
 * Wraps a body in an HTTP 200 response with permissive CORS headers.
 *
 * @param message Response body.
 * @param contentType Value for the `Content-Type` header; defaults to `text/markdown`.
 * @returns A `Response` with status 200, the CORS headers below and the given content type.
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}

/**
 * Builds a JSON error payload of the form `{"error":{"message":<msg>,"code":400}}`.
 *
 * The payload is sent through `response()`, so the HTTP status line is still 200;
 * only the JSON body carries the 400 code.
 *
 * @param msg Human-readable error description.
 * @returns A `Response` with content type `application/json`.
 */
function err(msg: string): Response {
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: 400,
    },
  });
  return response(errorMessage, "application/json");
}

/**
 * Parses user-supplied text into a `URL`, retrying with an `https://` prefix
 * when the text does not parse on its own (e.g. `example.com/page`).
 *
 * @param url Text to parse.
 * @returns The parsed URL.
 * @throws TypeError when the text is not a valid URL even with the prefix added.
 */
function fudgeURL(url: string): URL {
  try {
    return new URL(url);
  } catch {
    // First parse failed (typically no scheme); assume https.
    return new URL("https://" + url);
  }
}

function processInput(req: Request) {

let ret = {

url: undefined as undefined | URL,

response: undefined as undefined | Response,

};

const myurl = new URL(req.url);

let pathname = myurl.pathname.substring(1) + myurl.search;

if (!pathname.startsWith("http")) {

const urlAsFormParam = myurl.searchParams.get("url");

if (urlAsFormParam) {

pathname = urlAsFormParam;

} else if (pathname.length < 2) {

ret.response = response(

generate_ui(

"URL to convert to markdown:",

"https://www.val.town/v/taras/markdown_download",

"markdown.download",

"text/html",

readabilityHTTPProxy

100

import { resetStyle } from "https://esm.town/v/nbbaier/resetStyle";

import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";

import { html } from "https://esm.town/v/stevekrouse/html?v=5";

import { Readability } from "npm:@mozilla/readability";

// @ts-expect-error

import jsdom from "npm:jsdom";

// Shape of an extracted article.
// NOTE(review): the field names match the result object described in the
// @mozilla/readability README for `parse()` — confirm against the library version in use.
type Article = {

title: string;

content: string;

textContent: string;

length: number;

excerpt: string;

byline: string;

dir: string;

siteName: string;

lang: string;

publishedTime: string;

};

/**
 * Wraps page content in a minimal, styled HTML document.
 *
 * @param title Plain text for the `<title>` element. It is HTML-escaped here
 *   because callers pass titles taken from fetched articles.
 * @param pageContent HTML inserted verbatim into `<body>`.
 * @returns The complete HTML document as a string.
 */
const pageShell = (title: string, pageContent: string) => {
  // Escape "&" first so the entities produced by the later replacements are not re-escaped.
  const safeTitle = title
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

  return (`<!DOCTYPE html>
<html>
<head>
<title>${safeTitle}</title>
<style>
${resetStyle}
html {
  font-family: "Source Sans 3", system-ui, sans-serif;
  padding: 0.75rem;
  margin: auto;
  max-width: 90ch;
}
.title-container {
  display: flex;
  flex-direction: column;
  margin-block-end: 0.25rem;
}
.subhead {
  margin-block-start: 0.5rem;
  color: gray;
}
</style>
</head>
<body>
${pageContent}
</body>
</html>
`);
};

export default async function(req: Request): Promise<Response> {

const JSDOM = jsdom.JSDOM;

const url = new URL(req.url);

const articleUrl = url.pathname.substring(1);

if (articleUrl === "") {

return html(pageShell(

"VT Reader",

<h1>Val Town Reader</h1>

<p>Enter a url below to get a reader view using <a href="https://github.com/mozilla/readability">Readability.js</a></p>

<br>

</form>

document.getElementById('nameForm').onsubmit = function(event) {

event.preventDefault();

const targetURL = document.getElementById("name").value

const newURL = "${url.origin}" + "/" + targetURL

window.location = newURL

};

</script>

</body> `,

));

}

let body = await fetchText(articleUrl);

let doc = new JSDOM(body);

let reader = new Readability(doc.window.document);

let article = reader.parse();

let title = article.title;

return html(pageShell(

title,

Readable

Make any website instantly readable!

readable

import { Readability } from "npm:@mozilla/readability"; // @ts-ignore

import { JSDOM } from "npm:jsdom";

/**
 * HTTP handler: fetches the page named by the request path and returns its
 * Readability-extracted article wrapped in a Pico CSS page.
 *
 * The upstream address is `https://` followed by the request pathname, and the
 * incoming request is passed along as the fetch init.
 *
 * @param req Incoming request.
 * @returns An HTML response with the article content, or a 422 plain-text
 *   response when Readability cannot extract an article.
 */
export default async function(req: Request) {
  const resp = await fetch(`https://${new URL(req.url).pathname}`, req);
  const body = await resp.text();
  const doc = new JSDOM(body);
  // Readability mutates the document it is given, hence the clone.
  const reader = new Readability(doc.window.document.cloneNode(true));
  const article = reader.parse();
  if (!article) {
    // parse() yields null when no article can be extracted; previously this crashed on `.content`.
    return new Response("Could not extract readable content from this page", {
      status: 422,
      headers: {
        "Content-Type": "text/plain",
      },
    });
  }
  console.log(article.content);
  return new Response(
    `<html>
<head>
<link
rel="stylesheet"
href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"
/>
</head>
<body>
<main>
${article.content}
</main>
</body>
</html>`,
    {
      headers: {
        "Content-Type": "text/html",
      },
    },
  );
}

This is an in-progress Deno/Val Town port of https://github.com/tarasglek/scrape2md

License: MIT

Handy script to scrape various data sources into markdown. Intended to feed LLMs in https://chatcraft.org.

Usage: https://taras-scrape2md.web.val.run/ + URL_TO_SCRAPE

Or just visit in browser and paste your url

TODO

- https://chatcraft.org/api/share/tarasglek/IDYChVAilfePgVZb_T5pH
- POST from browser
- https://www.val.town/v/nbbaier/valToGH sync to GitHub

Metadata for use with https://github.com/tarasglek/valtown2js:

{
  "typeCheck": false,
  "mappings": {
    "https://esm.sh/linkedom": {
      "name": "linkedom",
      "version": "^0.16.8"
    }
  },
  "package": {
    "name": "scrape2md",
    "version": "1.0.0",
    "devDependencies": {
      "@types/turndown": "^5.0.4"
    }
  }
}

scrape2md

100

import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";

import { DOMParser } from "npm:linkedom@0.16.10";

import { marked } from "npm:marked@12.0.1";

import TurndownService from "npm:turndown@^7.1.3";

import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";

/**
 * Pulls the 11-character video id out of a YouTube-style URL.
 *
 * The pattern is applied to `url.href` and recognises `youtu.be/<id>`,
 * `youtube.com/v/<id>`, `/e/<id>`, `/embed/<id>`, nested paths ending in `/<id>`,
 * and any `youtube.com` URL carrying `v=<id>` in its query string.
 *
 * @param url URL to inspect.
 * @returns The captured id, or `null` when the href does not match.
 */
function getYoutubeVideoID(url: URL): string | null {
  const idPattern = /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = idPattern.exec(url.href);
  if (found === null) {
    return null;
  }
  return found[1];
}

/**
 * Wraps a body in an HTTP 200 response with permissive CORS headers.
 *
 * @param message Response body.
 * @param contentType Value for the `Content-Type` header; defaults to `text/markdown`.
 * @returns A `Response` with status 200, the CORS headers below and the given content type.
 */
function response(message: string, contentType = "text/markdown"): Response {
  return new Response(message, {
    status: 200,
    headers: new Headers({
      "Access-Control-Allow-Origin": "*",
      "Access-Control-Allow-Methods": "GET, POST, PUT, DELETE, OPTIONS",
      "Access-Control-Allow-Headers": "Content-Type, Authorization",
      "Access-Control-Max-Age": "86400",
      "Content-Type": contentType,
    }),
  });
}

/**
 * Builds a JSON error payload of the form `{"error":{"message":<msg>,"code":400}}`.
 *
 * The payload is sent through `response()`, so the HTTP status line is still 200;
 * only the JSON body carries the 400 code.
 *
 * @param msg Human-readable error description.
 * @returns A `Response` with content type `application/json`.
 */
function err(msg: string): Response {
  const errorMessage = JSON.stringify({
    error: {
      message: msg,
      code: 400,
    },
  });
  return response(errorMessage, "application/json");
}

/**
 * HTTP handler: converts the page named in the request into markdown.
 *
 * The target URL is the request path plus query string (after the leading "/"),
 * or the `url` query parameter when the path does not start with "http". With
 * neither, the module-level `html` UI page is served.
 *
 * - YouTube URLs return the video's subtitles as a markdown transcription.
 * - Other URLs are fetched with browser-like headers, reduced with Readability,
 *   and converted to markdown with the source URL appended.
 * - The markdown is rendered to HTML when the `Accept` header includes `text/html`.
 *
 * @param req Incoming request.
 * @returns The converted document, the UI page, or a JSON error for an unparsable URL.
 */
export default async function(req: Request): Promise<Response> {
  const myurl = new URL(req.url);
  let pathname = myurl.pathname.substring(1) + myurl.search;
  if (!pathname.startsWith("http")) {
    const urlAsFormParam = myurl.searchParams.get("url");
    if (urlAsFormParam) {
      pathname = urlAsFormParam;
    } else {
      return response(html, "text/html");
    }
  }

  // A `url` parameter without a scheme used to make `new URL` throw out of the handler.
  let url: URL;
  try {
    url = new URL(pathname);
  } catch {
    return err(`Invalid URL: ${pathname}`);
  }

  const youtubeVideoID = getYoutubeVideoID(url);
  if (youtubeVideoID) {
    const arr = (await getSubtitles({
      videoID: youtubeVideoID,
    })) as { text: string }[];
    const description = "## Generated Transcription\n\n"
      + arr.map(({ text }) => text).join("\n");
    return response(description);
  }

  const upstream = await fetch(url.toString(), {
    method: req.method,
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
      // Add any other headers you need here
    }),
  });
  // Named `pageSource` so it does not shadow the module-level `html` UI template.
  const pageSource = await upstream.text();
  const doc = new DOMParser().parseFromString(pageSource, "text/html");
  const reader = new Readability(doc);
  const article = reader.parse();
  console.log("content", typeof article?.content, article?.content);
  const markdown = new TurndownService().turndown(article?.content || "") + "\n\n" + url;
  if (req.headers.get("Accept")?.includes("text/html")) {
    return response(await marked.parse(markdown), "text/html");
  } else {
    return response(markdown);
  }
}

const html = `

<!DOCTYPE html>

<html>

<head>

<title>scrape2md ui</title>

fetchReadable

Fork

import { DOMParser } from "https://deno.land/x/deno_dom/deno-dom-wasm.ts";

import { Readability } from "npm:@mozilla/readability";

// inspired by https://www.val.town/v/stevekrouse/fetchJSON

// Result type of `fetchReadable`: article fields plus the URL the fetch ended on.
// NOTE(review): apart from `url`, the field names match the result object described in the
// @mozilla/readability README for `parse()` — confirm against the library version in use.
interface Readable {

title: string;

content: string;

textContent: string;

length: number;

excerpt: string;

byline: string;

dir: string;

siteName: string;

lang: string;

publishedTime: string;

// url is final url after redirects

url: string;

}

/**
 * Fetches a page and returns its Readability-extracted article.
 *
 * A `<base>` element pointing at the requested URL is appended to the parsed
 * document's head before extraction.
 *
 * @param url Address of the page to fetch; redirects are followed.
 * @returns The extracted article plus `url`, the final URL after redirects.
 * @throws Error prefixed with "fetchReadable error:" when the fetch, the HTML
 *   parsing or the extraction fails, including when no article is found.
 */
export const fetchReadable = async (url: string): Promise<Readable> => {
  try {
    // NOTE(review): "content-type" on a body-less request looks like it was meant to be
    // "accept" — left unchanged; confirm the intent.
    const r = await fetch(url, { headers: { "content-type": "text/html" }, redirect: "follow" });
    const text = await r.text();
    const node = new DOMParser().parseFromString(text, "text/html");
    if (!node) {
      throw new Error("response could not be parsed as HTML");
    }
    const relative = node.createElement("base");
    relative.href = url;
    node.head.appendChild(relative);
    const data = new Readability(node).parse();
    if (!data) {
      // parse() yields null when no article can be extracted.
      throw new Error("no readable content found");
    }
    return { ...data, url: r.url };
  } catch (err) {
    // The caught value is not guaranteed to be an Error.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`fetchReadable error: ${reason} in ${url}`);
  }
};

simplifyHTML

Fork

/**
 * Runs Readability over an HTML string.
 *
 * Both libraries are loaded with dynamic `import()` inside the call.
 *
 * @param params Object whose `html` property holds the markup to simplify.
 * @returns Whatever `Readability#parse()` returns for the parsed document.
 */
export const simplifyHTML = async (params) => {
  const domModule = await import(
    "https://deno.land/x/deno_dom/deno-dom-wasm.ts"
  );
  const readabilityModule = await import("npm:@mozilla/readability");
  const parsedDocument = new domModule.DOMParser().parseFromString(params.html, "text/html");
  return new readabilityModule.Readability(parsedDocument).parse();
};