about summary refs log tree commit diff
path: root/fun/amsterdump/scrape.el
diff options
context:
space:
mode:
authorVincent Ambo <Vincent Ambo>2020-01-05T21·09+0000
committerVincent Ambo <Vincent Ambo>2020-01-05T21·10+0000
commit7b77e9986c5bbedce6bc0a0154d253f71c9f2548 (patch)
treef28545c7409984057f792e8530a99f180de28e01 /fun/amsterdump/scrape.el
parentb8ca70539b8f3ec8dc05ea1d0a806307d54b2e3c (diff)
feat(fun/amsterdump): Add distance matrix lookup for Funda results r/337
This contains a little tool that can make requests to the Google Maps
API for distance matrix lookups from Funda results to Schiphol Airport
and Amsterdam Centraal.

<3 edef!
Diffstat (limited to 'fun/amsterdump/scrape.el')
-rw-r--r--fun/amsterdump/scrape.el25
1 files changed, 25 insertions, 0 deletions
diff --git a/fun/amsterdump/scrape.el b/fun/amsterdump/scrape.el
new file mode 100644
index 0000000000..f5537c2c8f
--- /dev/null
+++ b/fun/amsterdump/scrape.el
@@ -0,0 +1,25 @@
+;; Scraping funda.nl (this file is just notes and snippets, not full code)
+;;
+;; Begin by copying the whole page into a buffer (out of the browser's
+;; inspect-element view, because encoding is difficult)
+
+(goto-char (point-min))  ; keep-lines below works from point onward
+
+;; drop every line that is not a relevant result (URL-tracking or img alt)
+(keep-lines "data-object-url-tracking\\|img alt")
+
+;; mark every "</span>" (prompt stubbed out), then move them to the buffer end
+(cl-letf (((symbol-function 'read-regexp) (lambda (&rest _args) "</span>")))
+  (let ((start (point-min)) (end (point-max)))
+    (mc/mark-all-in-region-regexp start end)))
+
+;; mark all image lines (these contain street addresses for things
+;; with images), clean up and join with the previous line
+;;
+;; mark all: data-image-error-fallback
+
+;; delete all lines that contain neither a span nor an img tag (there
+;; are duplicates); bounds given because keep-lines otherwise starts at point
+(keep-lines "span class\\|img alt" (point-min) (point-max))
+
+;; do some manual cleanup of the hrefs and you're done