diff options
Diffstat (limited to 'fun/amsterdump/scrape.el')
-rw-r--r-- | fun/amsterdump/scrape.el | 25 |
1 files changed, 25 insertions, 0 deletions
diff --git a/fun/amsterdump/scrape.el b/fun/amsterdump/scrape.el
new file mode 100644
index 000000000000..f5537c2c8f17
--- /dev/null
+++ b/fun/amsterdump/scrape.el
@@ -0,0 +1,25 @@
+;; Scraping funda.nl (this file is just notes and snippets, not full code)
+;;
+;; Begin by copying the whole page into a buffer (out of inspect element,
+;; because encoding is difficult)
+
+(goto-char (point-min)) ; keep-lines below filters from point onwards
+
+;; zap everything that isn't a relevant result
+(keep-lines "data-object-url-tracking\\|img alt")
+
+;; mark all spans, move them to the end of the buffer
+(cl-letf (((symbol-function 'read-regexp)
+           (lambda (&rest _) "</span>"))) ; stub; presumably avoids the prompt
+  (mc/mark-all-in-region-regexp (point-min) (point-max)))
+
+;; mark all image lines (these contain street addresses for things
+;; with images), clean up and join with previous
+;;
+;; mark all: data-image-error-fallback
+
+;; delete all lines that contain neither a span nor an img tag
+;; (there are duplicates)
+(keep-lines "span class\\|img alt")
+
+;; do some manual cleanup from the hrefs and done |