From b379e44dfb871100546b3d60bd3c77fbeac61adc Mon Sep 17 00:00:00 2001 From: sterni Date: Wed, 3 May 2023 18:19:41 +0200 Subject: refactor(3p/lisp/mime4cl): use flexi-streams and binary input This refactor is driven by the following (ultimate) aims: - Get rid of as much of the custom stream code in mime4cl which makes less code to maintain in the future. - Lay the groundwork for correct handling of 8bit transfer encoding: The mime4cl we inherited assumes that any MIME message can be decoded completely by the CL implementation (in SBCL's case using latin1) into CHARACTERs. This is not necessarily the case. flexi-streams allows changing how the stream is decoded on the fly and also has support for reading the underlying bytes which is perfect for the requirements decoding MIME has. - Since flexi-streams uses trivial-gray-streams, it supports READ-SEQUENCE. Taking advantage of this may improve decoding performance significantly in the future. This incurs the following changes: - Naturally we now open given files as binary files in MIME-MESSAGE. Given strings are encoded using STRING-TO-OCTETS and then passed on to a new octet vector method. Instead of MY-STRING-INPUT-STREAM this now uses flexi-streams' WITH-INPUT-FROM-SEQUENCE. - OPEN-FILE-PORTION and OPEN-DECODED-FILE-PORTION need to be merged, since the transfer encoding not only implies an extra decoder stream that needs to be attached after file portion stream, but also imply a certain encoding of the stream itself (mostly binary vs. ASCII). As flexi-streams can change their encoding on the fly this could be untangled again, but it is not strictly necessary. As before, we use the DATA slot of the file portion to create a fresh stream if possible. Instead of strings we now use an vector of octets to match MIME-MESSAGE. The actual portioned stream relies on POSITIONED-FLEXI-INPUT-STREAM, a subclass of the stock FLEXI-INPUT-STREAM class, described below. - POSITIONED-FLEXI-INPUT-STREAM replaces DELIMITED-INPUT-STREAM. It is created using MAKE-POSITIONED-FLEXI-INPUT-STREAM which accepts the same arguments as MAKE-FLEXI-STREAMS and, additionally, :IGNORE-CLOSE. A POSITIONED-FLEXI-INPUT-STREAM works the same as an FLEXI-INPUT-STREAM, but upon creation, the underlying stream is rewinded or forwarded to the argument given by :POSITION using FILE-POSITION. If :IGNORE-CLOSE is T, a call to CLOSE is not forwarded to the underlying stream. Change-Id: I2d48c769bb110ca0b7cf52441bd63c1e1c2ccd04 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8559 Reviewed-by: sterni Autosubmit: sterni Tested-by: BuildkiteCI --- third_party/lisp/mime4cl/mime.lisp | 47 ++++---- third_party/lisp/mime4cl/streams.lisp | 197 ++++++++++++++++------------------ 2 files changed, 115 insertions(+), 129 deletions(-) (limited to 'third_party/lisp/mime4cl') diff --git a/third_party/lisp/mime4cl/mime.lisp b/third_party/lisp/mime4cl/mime.lisp index b240b02f717c..ac828216cbcc 100644 --- a/third_party/lisp/mime4cl/mime.lisp +++ b/third_party/lisp/mime4cl/mime.lisp @@ -613,18 +613,18 @@ found in STREAM." (defgeneric decode-mime-body (part input-stream)) -(defmethod decode-mime-body ((part mime-part) (stream delimited-input-stream)) - (be base (base-stream stream) +(defmethod decode-mime-body ((part mime-part) (stream flexi-stream)) + (be base (flexi-stream-root-stream stream) (if *lazy-mime-decode* (setf (mime-body part) (make-file-portion :data (etypecase base - (my-string-input-stream - (stream-string base)) + (vector-stream + (flexi-streams::vector-stream-vector base)) (file-stream (pathname base))) :encoding (mime-encoding part) - :start (file-position stream) - :end (stream-end stream))) + :start (flexi-stream-position stream) + :end (flexi-stream-bound stream))) (call-next-method)))) (defmethod decode-mime-body ((part mime-part) (stream file-stream)) @@ -635,12 +635,12 @@ found in STREAM." :start (file-position stream))) (call-next-method))) -(defmethod decode-mime-body ((part mime-part) (stream my-string-input-stream)) +(defmethod decode-mime-body ((part mime-part) (stream vector-stream)) (if *lazy-mime-decode* (setf (mime-body part) - (make-file-portion :data (stream-string stream) + (make-file-portion :data (flexi-streams::vector-stream-vector stream) :encoding (mime-encoding part) - :start (file-position stream))) + :start (flexi-streams::vector-stream-index stream))) (call-next-method))) (defmethod decode-mime-body ((part mime-part) stream) @@ -658,11 +658,10 @@ list of MIME parts." (be *default-type* (if (eq :digest (mime-subtype part)) '("message" "rfc822" ()) '("text" "plain" (("charset" . "us-ascii")))) - in (make-instance 'delimited-input-stream - :underlying-stream stream - :dont-close t - :start start - :end end) + in (make-positioned-flexi-input-stream stream + :position start + :bound end + :ignore-close t) (read-mime-part in)))) offsets))))) @@ -754,17 +753,21 @@ returns a MIME-MESSAGE object." msg) (defmethod mime-message ((msg string)) - (with-open-stream (in (make-instance 'my-string-input-stream :string msg)) - (read-mime-message in))) + (mime-message (flexi-streams:string-to-octets msg))) -(defmethod mime-message ((msg stream)) - (read-mime-message msg)) +(defmethod mime-message ((msg vector)) + (with-input-from-sequence (in msg) + (mime-message in))) (defmethod mime-message ((msg pathname)) - (let (#+sbcl(sb-impl::*default-external-format* :latin-1) - #+sbcl(sb-alien::*default-c-string-external-format* :latin-1)) - (with-open-file (in msg) - (read-mime-message in)))) + (with-open-file (in msg :element-type '(unsigned-byte 8)) + (mime-message in))) + +(defmethod mime-message ((msg flexi-stream)) + (read-mime-message msg)) + +(defmethod mime-message ((msg stream)) + (read-mime-message (make-flexi-stream msg))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/third_party/lisp/mime4cl/streams.lisp b/third_party/lisp/mime4cl/streams.lisp index 99fea5e422a7..055104b5c422 100644 --- a/third_party/lisp/mime4cl/streams.lisp +++ b/third_party/lisp/mime4cl/streams.lisp @@ -235,121 +235,104 @@ in a stream of character.")) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(defclass delimited-input-stream (fundamental-character-input-stream coder-stream-mixin) - ((start-offset :initarg :start - :initform 0 - :reader stream-start - :type integer) - (end-offset :initarg :end - :initform nil - :reader stream-end - :type (or null integer)) - (current-offset :type integer))) - -(defmethod print-object ((object delimited-input-stream) stream) - (if *print-readably* - (call-next-method) - (with-slots (start-offset end-offset) object - (print-unreadable-object (object stream :type t :identity t) - (format stream "start=~A end=~A" start-offset end-offset))))) - -(defun base-stream (stream) - (if (typep stream 'delimited-input-stream) - (base-stream (real-stream stream)) - stream)) - -(defmethod initialize-instance ((stream delimited-input-stream) &key &allow-other-keys) - (call-next-method) - (unless (slot-boundp stream 'real-stream) - (error "REAL-STREAM is unbound. Must provide a :UNDERLYING-STREAM argument.")) - (with-slots (start-offset) stream - (file-position stream start-offset))) - -(defmethod (setf stream-file-position) (newval (stream delimited-input-stream)) - (with-slots (current-offset real-stream) stream - (setf current-offset newval) - (call-next-method))) - -(defmethod stream-file-position ((stream delimited-input-stream)) - (slot-value stream 'current-offset)) - -;; Calling file-position with SBCL on every read is quite expensive, since -;; it will invoke lseek each time. This is so expensive that it's faster to -;; /compute/ the amount the stream got advanced by. -;; file-position's behavior however, is quite flexible and it behaves differently -;; not only for different implementation, but also different streams in SBCL. -;; Thus, we should ideally go back to file-position and try to reduce the amount -;; of calls by using read-sequence. -;; TODO(sterni): make decoders use read-sequence and drop offset tracking code -(macrolet ((def-stream-read (name read-fun update-offset-form) - `(defmethod ,name ((stream delimited-input-stream)) - (with-slots (real-stream end-offset current-offset) stream - (let ((el (if (or (not end-offset) - (< current-offset end-offset)) - (or (,read-fun real-stream nil) - :eof) - :eof))) - (setf current-offset ,update-offset-form) - el))))) - - ;; Assume we are using an encoding where < 128 is one byte, in all other cases - ;; it's hard to guess how much file-position will increase - (def-stream-read stream-read-char read-char - (if (or (eq el :eof) (< (char-code el) 128)) - (1+ current-offset) - (file-position real-stream))) - - (def-stream-read stream-read-byte read-byte (1+ current-offset))) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(defclass my-string-input-stream (fundamental-character-input-stream coder-stream-mixin) - ((string :initarg :string - :reader stream-string))) - -(defmethod initialize-instance ((stream my-string-input-stream) &key &allow-other-keys) +(defclass positioned-flexi-input-stream (flexi-input-stream) + ((ignore-close + :initform nil + :initarg :ignore-close + :documentation + "If T, calling CLOSE on the stream does nothing. +If NIL, the underlying stream is closed.")) + (:documentation + "FLEXI-INPUT-STREAM that automatically advances the underlying :STREAM to +the location given by :POSITION. This uses FILE-POSITION internally, so it'll +only works if the underlying stream position is tracked in bytes. Note that +the underlying stream is still advanced, so having multiple instances of +POSITIONED-FLEXI-INPUT-STREAM based with the same underlying stream won't work +reliably. +If :IGNORE-CLOSE is set, the underlying stream won't be closed if CLOSE is +called on the POSITIONED-FLEXI-INPUT-STREAM.")) + +(defmethod initialize-instance ((stream positioned-flexi-input-stream) + &key &allow-other-keys) (call-next-method) - (assert (slot-boundp stream 'string)) - (with-slots (string real-stream) stream - (setf real-stream (make-string-input-stream string)))) - -(defmethod stream-read-char ((stream my-string-input-stream)) - (with-slots (real-stream) stream - (or (read-char real-stream nil) - :eof))) + ;; The :POSITION initarg is only informational for flexi-streams: It assumes + ;; it is were the stream it got is already at and continuously updates it + ;; for querying (via FLEXI-STREAM-POSITION) and bound checking. + ;; Since we have streams that are not positioned correctly, we need to do this + ;; here using FILE-POSITION. Note that assumes the underlying implementation + ;; uses bytes for FILE-POSITION which is not guaranteed (probably some streams + ;; even in SBCL don't). + (file-position (flexi-stream-stream stream) (flexi-stream-position stream))) + +(defmethod close ((stream positioned-flexi-input-stream) &key abort) + (declare (ignore abort)) + (with-slots (ignore-close) stream + (unless ignore-close + (call-next-method)))) + +(defun make-positioned-flexi-input-stream (stream &rest args) + "Create a POSITIONED-FLEXI-INPUT-STREAM. Accepts the same keyword arguments as +MAKE-FLEXI-STREAM as well as :IGNORE-CLOSE. Causes the FILE-POSITION of STREAM to +be modified to match the :POSITION argument." + (apply #'make-instance + 'positioned-flexi-input-stream + :stream stream + (mapcar (lambda (x) + ;; make-flexi-stream has a discrepancy between :initarg of + ;; make-instance and its &key which we mirror here. + (if (eq x :external-format) :flexi-stream-external-format x)) + args))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; TODO(sterni): test correct behavior with END NIL (defstruct file-portion data ; string or a pathname encoding start end) -(defun open-file-portion (file-portion) - (be data (file-portion-data file-portion) - (etypecase data - (pathname - (be stream (open data) - (make-instance 'delimited-input-stream - :underlying-stream stream - :start (file-portion-start file-portion) - :end (file-portion-end file-portion)))) - (string - (make-instance 'delimited-input-stream - :underlying-stream (make-string-input-stream data) - :start (file-portion-start file-portion) - :end (file-portion-end file-portion))) - (stream - (make-instance 'delimited-input-stream - :underyling-stream data - :dont-close t - :start (file-portion-start file-portion) - :end (file-portion-end file-portion)))))) - (defun open-decoded-file-portion (file-portion) - (make-instance (case (file-portion-encoding file-portion) - (:quoted-printable 'quoted-printable-decoder-stream) - (:base64 'base64-decoder-stream) - (t '8bit-decoder-stream)) - :underlying-stream (open-file-portion file-portion))) + (with-slots (data encoding start end) + file-portion + (let* ((binary-stream + (etypecase data + (pathname + (open data :element-type '(unsigned-byte 8))) + ((vector (unsigned-byte 8)) + (flexi-streams:make-in-memory-input-stream data)) + (stream + ;; TODO(sterni): assert that bytes/flexi-stream + data))) + (params (ccase encoding + ((:quoted-printable :base64) '(:external-format :us-ascii)) + (:8bit '(:element-type (unsigned-byte 8))) + (:7bit '(:external-format :us-ascii)))) + (portion-stream (apply #'make-positioned-flexi-input-stream + binary-stream + :position start + :bound end + ;; if data is a stream we can't have a + ;; FILE-PORTION without modifying it when + ;; reading etc. The least we can do, though, + ;; is forgo destroying it. + :ignore-close (typep data 'stream) + params)) + (needs-decoder-stream (member encoding '(:quoted-printable + :base64)))) + + (if needs-decoder-stream + (make-instance + (ccase encoding + (:quoted-printable 'quoted-printable-decoder-stream) + (:base64 'base64-decoder-stream)) + :underlying-stream portion-stream) + portion-stream)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(defun flexi-stream-root-stream (stream) + "Return the non FLEXI-STREAM stream a given chain of FLEXI-STREAMs is based on." + (if (typep stream 'flexi-stream) + (flexi-stream-root-stream (flexi-stream-stream stream)) + stream)) -- cgit 1.4.1