about summary refs log tree commit diff
path: root/fun/amsterdump/scrape.el
diff options
context:
space:
mode:
Diffstat (limited to 'fun/amsterdump/scrape.el')
-rw-r--r--fun/amsterdump/scrape.el25
1 files changed, 25 insertions, 0 deletions
diff --git a/fun/amsterdump/scrape.el b/fun/amsterdump/scrape.el
new file mode 100644
index 000000000000..f5537c2c8f17
--- /dev/null
+++ b/fun/amsterdump/scrape.el
@@ -0,0 +1,25 @@
+;; Scraping funda.nl (this file is just notes and snippets, not full code)
+;;
+;; Begin by copying whole page into buffer (out of inspect element
+;; because encoding is difficult)
+
+(beginning-of-buffer)
+
+;; zap everything that isn't a relevant result
+(keep-lines "data-object-url-tracking\\|img alt")
+
+;; mark all spans, move them to the end of the buffer
+(cl-letf (((symbol-function 'read-regexp)
+           (lambda (&rest _) "</span>")))
+  (mc/mark-all-in-region-regexp (point-min) (point-max)))
+
+;; mark all image lines (these contain street addresses for things
+;; with images), clear up and join with previous
+;;
+;; mark all: data-image-error-fallback
+
+;; delete all lines that contain neither a span nor an img tag
+;; (there are duplicates)
+(keep-lines "span class\\|img alt")
+
+;; do some manual cleanup from the hrefs and done