Readme

This is an in-progress Deno/Val Town port of https://github.com/tarasglek/scrape2md

License: MIT

A handy script for scraping various data sources into Markdown. It is intended to feed LLMs in https://chatcraft.org

Usage: https://taras-scrape2md.web.val.run/ + URL_TO_SCRAPE

Or just visit it in a browser and paste your URL.

TODO

- https://chatcraft.org/api/share/tarasglek/IDYChVAilfePgVZb_T5pH
- POST from browser
- https://www.val.town/v/nbbaier/valToGH sync to GitHub

Metadata for use with https://github.com/tarasglek/valtown2js:

{
  "typeCheck": false,
  "mappings": {
    "https://esm.sh/linkedom": {
      "name": "linkedom",
      "version": "^0.16.8"
    }
  },
  "package": {
    "name": "scrape2md",
    "version": "1.0.0",
    "devDependencies": {
      "@types/turndown": "^5.0.4"
    }
  }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { isProbablyReaderable, Readability } from "npm:@mozilla/readability@^0.5.0";
import { DOMParser } from "npm:linkedom@0.16.10";
import { marked } from "npm:marked@12.0.1";
import TurndownService from "npm:turndown@^7.1.3";
import { getSubtitles } from "npm:youtube-captions-scraper@^2.0.1";
/**
 * Extracts the 11-character video ID from a YouTube URL.
 *
 * The pattern accepts `youtube.com` URLs (watch links with a `v=` query
 * parameter, `/v/`, `/e/`, `/embed/` and nested paths) as well as `youtu.be`
 * short links.
 *
 * @param url - The URL to inspect; only its `href` is examined.
 * @returns The captured video ID, or `null` when the URL does not match.
 */
function getYoutubeVideoID(url: URL): string | null {
  const youtubeIdPattern =
    /(?:youtube\.com\/(?:[^/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?/\s]{11})/i;
  const found = youtubeIdPattern.exec(url.href);
  if (found === null) {
    return null;
  }
  // Capture group 1 holds the 11-character ID.
  return found[1];
}
/**
 * Builds an HTTP 200 response carrying permissive CORS headers.
 *
 * @param message - The response body.
 * @param contentType - Value for the `Content-Type` header; defaults to
 *   `text/markdown`.
 * @returns A `Response` with status 200, the CORS headers below and the given
 *   content type.
 */
function response(message: string, contentType = "text/markdown"): Response {
  const corsAndTypeHeaders = new Headers([
    ["Access-Control-Allow-Origin", "*"],
    ["Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS"],
    ["Access-Control-Allow-Headers", "Content-Type, Authorization"],
    ["Access-Control-Max-Age", "86400"],
    ["Content-Type", contentType],
  ]);
  return new Response(message, { status: 200, headers: corsAndTypeHeaders });
}
/**
 * Wraps an error message in a JSON error envelope and returns it as a response.
 *
 * The body has the shape `{ "error": { "message": msg, "code": 400 } }`.
 * The response is produced by `response()`, so the 400 appears only in the
 * JSON body.
 *
 * @param msg - Human-readable description of what went wrong.
 * @returns A response with content type `application/json`.
 */
function err(msg: string): Response {
  const payload = {
    error: {
      message: msg,
      code: 400,
    },
  };
  return response(JSON.stringify(payload), "application/json");
}
/**
 * HTTP entry point: scrapes the requested URL and returns it as Markdown.
 *
 * The target URL is taken from the request path (everything after the leading
 * `/`, plus the query string) when that starts with `http`, otherwise from the
 * `url` query parameter. When neither is present the `html` UI page is served.
 *
 * - URLs recognised by `getYoutubeVideoID` are answered with the video's
 *   subtitles, joined line by line under a "Generated Transcription" heading.
 * - Any other URL is fetched, run through Readability and Turndown, and
 *   returned as Markdown with the source URL appended. When the request's
 *   `Accept` header includes `text/html`, the Markdown is rendered to HTML
 *   with `marked` instead.
 *
 * @param req - The incoming request.
 * @returns The scraped content, the UI page, or a JSON error (via `err`) when
 *   the target is not a valid http(s) URL.
 */
export default async function(req: Request): Promise<Response> {
  const myurl = new URL(req.url);
  let pathname = myurl.pathname.substring(1) + myurl.search;
  if (!pathname.startsWith("http")) {
    const urlAsFormParam = myurl.searchParams.get("url");
    if (!urlAsFormParam) {
      return response(html, "text/html");
    }
    pathname = urlAsFormParam;
  }

  // The target is untrusted input: a malformed value must produce an error
  // response rather than an uncaught exception from the URL constructor.
  let url: URL;
  try {
    url = new URL(pathname);
  } catch {
    return err(`Invalid URL: ${pathname}`);
  }
  // Only web URLs are scraped; reject file:, data:, etc. before fetching.
  if (url.protocol !== "http:" && url.protocol !== "https:") {
    return err(`Unsupported URL scheme: ${url.protocol}`);
  }

  const youtubeVideoID = getYoutubeVideoID(url);
  if (youtubeVideoID) {
    const captions = (await getSubtitles({
      videoID: youtubeVideoID,
    })) as { text: string }[];
    const description = "## Generated Transcription\n\n"
      + captions.map(({ text }) => text).join("\n");
    return response(description);
  }

  // Browser-like request headers; presumably to avoid being served a
  // bot-blocking page by the upstream site.
  const upstream = await fetch(url.toString(), {
    method: req.method,
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
      // Add any other headers you need here
    }),
  });
  const pageSource = await upstream.text();
  const doc = new DOMParser().parseFromString(pageSource, "text/html");

  const article = new Readability(doc).parse();
  console.log("content", typeof article?.content, article?.content);
  // A missing or empty article yields an empty body followed by the source URL.
  const markdown = new TurndownService().turndown(article?.content || "") + "\n\n" + url.href;

  if (req.headers.get("Accept")?.includes("text/html")) {
    return response(await marked.parse(markdown), "text/html");
  }
  return response(markdown);
}
const html = `
<!DOCTYPE html>
<html>
<head>
<title>scrape2md ui</title>
<!-- Tailwind CSS -->
👆 This is a val. Vals are TypeScript snippets of code, written in the browser and run on our servers. Create scheduled functions, email yourself, and persist small pieces of data — all from the browser.