diff options
author | Profpatsch <mail@profpatsch.de> | 2023-06-04T00·34+0200 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2023-07-13T23·03+0000 |
commit | 9a91669ba75955dbb20300c7ef44e72274750a95 (patch) | |
tree | f00df63c5769d65eab009e0c9035df034e2299ef /users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs | |
parent | c2baefbecc22986338563775524d41bb2f522bf1 (diff) |
feat(users/Profpatsch/jbovlaste-sqlite): add XML parser r/6411
nice. Change-Id: Iea90578742bfb689cd0508dbaf641c31aed577ad Reviewed-on: https://cl.tvl.fyi/c/depot/+/8709 Tested-by: BuildkiteCI Autosubmit: Profpatsch <mail@profpatsch.de> Reviewed-by: Profpatsch <mail@profpatsch.de>
Diffstat (limited to 'users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs')
-rw-r--r-- | users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs | 298 |
1 files changed, 285 insertions, 13 deletions
diff --git a/users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs b/users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs index 3d2811865a77..3fdd3076ed55 100644 --- a/users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs +++ b/users/Profpatsch/jbovlaste-sqlite/JbovlasteSqlite.hs @@ -2,27 +2,136 @@ module Main where +import Control.Category qualified +import Control.Category qualified as Cat +import Control.Selective (Selective) import Data.ByteString.Internal qualified as Bytes import Data.Error.Tree +import Data.Functor.Compose import Data.List qualified as List import Data.Map.Strict qualified as Map +import Data.Maybe (catMaybes) +import Data.Monoid (First (..)) +import Data.Semigroup.Traversable +import Data.Semigroupoid qualified as Semigroupoid import Data.Text qualified as Text import Data.Text.IO qualified as Text +import FieldParser (FieldParser) +import FieldParser qualified as Field import Label import PossehlAnalyticsPrelude +import Pretty import Text.XML (def) import Text.XML qualified as Xml +import Validation (partitionValidations) +import Prelude hiding (maybe) +import Prelude qualified main :: IO () main = do f <- file f.documentRoot - & filterElementsRec noUsers - & downTo (T2 (label @"maxdepth" 5) (label @"maxlistitems" 20)) + & filterDown & toTree & prettyErrorTree & Text.putStrLn +filterDown :: Xml.Element -> Xml.Element +filterDown el = + el + & filterElementsRec noUsers + & downTo (T2 (label @"maxdepth" 5) (label @"maxlistitems" 30)) + +data Valsi = Valsi + { definition :: Text, + definitionId :: Natural, + typ :: Text, + selmaho :: Maybe Text, + notes :: Maybe Text, + glosswords :: [T2 "word" Text "sense" (Maybe Text)], + keywords :: [T3 "word" Text "place" Natural "sense" (Maybe Text)] + } + deriving stock (Show) + +test :: IO () +test = do + f <- file + parseJbovlasteXml f + & \case + Left errs -> Text.putStrLn $ prettyErrorTree errs + Right el -> do + el & traverse_ printPretty + +parseJbovlasteXml :: HasField "documentRoot" r Xml.Element => r -> Either ErrorTree [Valsi] +parseJbovlasteXml xml = + xml.documentRoot + & runParse + "uh oh" + ( ( element "dictionary" <&> (.elementNodes) <&> mapMaybe nodeElementMay + ) + >>> ( ( find + ( element "direction" + >>> ( ( do + (attribute "from" >>> exactly showToText "lojban") + *> (attribute "to" >>> exactly showToText "English") + *> Cat.id + ) + ) + ) + >>> dimap + (\x -> x.elementNodes <&> nodeElementMay) + (catMaybes) + ( multiple + (\idx _ -> [fmt|{idx}|]) + ( maybe $ + (element "valsi") + >>> do + let subNodes = + ( Cat.id + <&> (.elementNodes) + <&> mapMaybe nodeElementMay + ) + + let subElementContent elName = + subNodes + >>> ( (find (element elName)) + <&> (.elementNodes) + ) + >>> exactlyOne + >>> content + let optionalSubElementContent elName = + subNodes + >>> ((findAll (element elName) >>> zeroOrOne)) + >>> (maybe (lmap (.elementNodes) exactlyOne >>> content)) + + typ <- attribute "type" + selmaho <- optionalSubElementContent "selmaho" + definition <- subElementContent "definition" + definitionId <- subElementContent "definitionid" >>> fieldParser Field.decimalNatural + notes <- optionalSubElementContent "notes" + glosswords <- + (subNodes >>> findAll (element "glossword")) + >>> ( multiple (\idx _ -> [fmt|{idx}|]) $ do + word <- label @"word" <$> (attribute "word") + sense <- label @"sense" <$> (attributeMay "sense") + pure $ T2 word sense + ) + keywords <- + (subNodes >>> findAll (element "keyword")) + >>> ( multiple (\idx _ -> [fmt|{idx}|]) $ do + word <- label @"word" <$> (attribute "word") + place <- label @"place" <$> (attribute "place" >>> fieldParser Field.decimalNatural) + sense <- label @"sense" <$> (attributeMay "sense") + pure $ T3 word place sense + ) + + pure $ Valsi {..} + ) + ) + ) + ) + ) + file :: IO Xml.Document file = Xml.readFile def "./jbovlaste-en.xml" @@ -75,15 +184,10 @@ downTo n el = toTree :: Xml.Element -> ErrorTree toTree el = do - let outer = - if not $ null el.elementAttributes - then [fmt|<{name el.elementName}: {attrs el.elementAttributes}>|] - else [fmt|<{name el.elementName}>|] - case el.elementNodes & filter (not . isEmptyContent) & nonEmpty of - Nothing -> singleError (newError outer) - Just (n :| []) | not $ isElementNode n -> singleError $ errorContext outer (nodeErrorNoElement n) - Just nodes -> nestedMultiError (newError outer) (nodes <&> node) + Nothing -> singleError (newError (prettyXmlElement el)) + Just (n :| []) | not $ isElementNode n -> singleError $ errorContext (prettyXmlElement el) (nodeErrorNoElement n) + Just nodes -> nestedMultiError (newError (prettyXmlElement el)) (nodes <&> node) where isEmptyContent = \case Xml.NodeContent c -> c & Text.all Bytes.isSpaceChar8 @@ -104,7 +208,175 @@ toTree el = do Xml.NodeComment c -> [fmt|<!-- {c} -->|] Xml.NodeElement _ -> error "NodeElement not allowed here" - name :: Xml.Name -> Text - name n = [fmt|{n.namePrefix & fromMaybe ""}{n.nameLocalName}|] +prettyXmlName :: Xml.Name -> Text +prettyXmlName n = [fmt|{n.namePrefix & fromMaybe ""}{n.nameLocalName}|] + +prettyXmlElement :: Xml.Element -> Text +prettyXmlElement el = + if not $ null el.elementAttributes + then [fmt|<{prettyXmlName el.elementName}: {attrs el.elementAttributes}>|] + else [fmt|<{prettyXmlName el.elementName}>|] + where attrs :: Map Xml.Name Text -> Text - attrs a = a & Map.toList <&> (\(k, v) -> [fmt|{name k}={v}|]) & Text.intercalate ", " & \s -> [fmt|({s})|] + attrs a = a & Map.toList <&> (\(k, v) -> [fmt|{prettyXmlName k}={v}|]) & Text.intercalate ", " & \s -> [fmt|({s})|] + +nodeElementMay :: Xml.Node -> Maybe Xml.Element +nodeElementMay = \case + Xml.NodeElement el -> Just el + _ -> Nothing + +newtype Context = Context (Maybe [Text]) + deriving stock (Show) + deriving (Semigroup, Monoid) via (First [Text]) + +newtype Parse from to = Parse ((Context, from) -> Validation (NonEmpty ErrorTree) (Context, to)) + deriving + (Functor, Applicative, Selective) + via ( Compose + ( Compose + ((->) (Context, from)) + (Validation (NonEmpty ErrorTree)) + ) + ((,) Context) + ) + +instance Semigroupoid Parse where + o p2 p1 = Parse $ \from -> case runParse' p1 from of + Failure err -> Failure err + Success to1 -> runParse' p2 to1 + +instance Category Parse where + (.) = Semigroupoid.o + id = Parse $ \t -> Success t + +instance Profunctor Parse where + lmap f (Parse p) = Parse $ lmap (second f) p + rmap = (<$>) + +runParse :: Error -> Parse from to -> from -> Either ErrorTree to +runParse errMsg parser t = + (Context (Just ["$"]), t) + & runParse' parser + <&> snd + & first (nestedMultiError errMsg) + & validationToEither + +runParse' :: Parse from to -> (Context, from) -> Validation (NonEmpty ErrorTree) (Context, to) +runParse' (Parse f) from = f from + +showContext :: Context -> Text +showContext (Context context) = context & fromMaybe [] & List.reverse & Text.intercalate "." + +addContext :: Text -> Context -> Context +addContext x (Context mxs) = Context (Just $ x : (mxs & fromMaybe [])) + +element :: Text -> Parse Xml.Element Xml.Element +element name = Parse $ \(ctx, el) -> + if el.elementName.nameLocalName == name + then Success (ctx & addContext (prettyXmlName el.elementName), el) + else Failure $ singleton [fmt|Expected element named <{name}> but got {el & prettyXmlElement} at {showContext ctx}|] + +content :: Parse Xml.Node Text +content = Parse $ \(ctx, node) -> case node of + Xml.NodeContent t -> Success (ctx, t) + -- TODO: give an example of the node content? + n -> Failure $ singleton [fmt|Expected a content node, but got a {n & nodeType} node, at {showContext ctx}|] + where + nodeType = \case + Xml.NodeContent _ -> "content" :: Text + Xml.NodeComment _ -> "comment" + Xml.NodeInstruction _ -> "instruction" + Xml.NodeElement _ -> "element" + +attribute :: Text -> Parse Xml.Element Text +attribute name = Parse $ \(ctx, el) -> + case el.elementAttributes & Map.mapKeys (.nameLocalName) & Map.lookup name of + Just a -> Success (ctx & addContext [fmt|{{attr:{name}}}|], a) + Nothing -> Failure $ singleton [fmt|Attribute "{name}" missing at {showContext ctx}|] + +attributeMay :: Text -> Parse Xml.Element (Maybe Text) +attributeMay name = Parse $ \(ctx, el) -> + case el.elementAttributes & Map.mapKeys (.nameLocalName) & Map.lookup name of + Just a -> Success (ctx & addContext [fmt|{{attr:{name}}}|], Just a) + Nothing -> Success (ctx, Nothing) + +-- | 'oneOf' but only one value possible +exactly :: Eq from => (from -> Text) -> from -> Parse from from +exactly errDisplay from = Parse $ \(ctx, from') -> + if from == from' + then Success (ctx, from') + else Failure $ singleton [fmt|Field has to be exactly {errDisplay from}, was: {errDisplay from'} at {showContext ctx}|] + +multiple :: (Natural -> a1 -> Error) -> Parse a1 a2 -> Parse [a1] [a2] +multiple errorFn inner = dimap nonEmpty (Prelude.maybe [] toList) (maybe $ multipleNE errorFn inner) + +multipleNE :: (Natural -> from -> Error) -> Parse from to -> Parse (NonEmpty from) (NonEmpty to) +multipleNE errorFn inner = Parse $ \(ctx, from) -> + from + & zipIndex + & traverse (\(idx, f) -> runParse' inner (ctx, f) & first (singleton . nestedMultiError (errorFn idx f))) + -- we assume that, since the same parser is used everywhere, the context will be the same as well (TODO: correct?) + & second (\((ctx', y) :| ys) -> (ctx', y :| (snd <$> ys))) + +maybe :: Parse from to -> Parse (Maybe from) (Maybe to) +maybe inner = Parse $ \(ctx, m) -> case m of + Nothing -> Success (ctx, Nothing) + Just a -> runParse' inner (ctx, a) & second (fmap Just) + +exactlyOne :: Parse [from] from +exactlyOne = Parse $ \(ctx, xs) -> case xs of + [] -> Failure $ singleton [fmt|Expected exactly 1 element, but got 0, at {ctx & showContext}|] + [one] -> Success (ctx, one) + _more -> Failure $ singleton [fmt|Expected exactly 1 element, but got 2, at {ctx & showContext}|] + +zeroOrOne :: Parse [from] (Maybe from) +zeroOrOne = Parse $ \(ctx, xs) -> case xs of + [] -> Success (ctx, Nothing) + [one] -> Success (ctx, Just one) + _more -> Failure $ singleton [fmt|Expected exactly 1 element, but got 2, at {ctx & showContext}|] + +find :: Parse from to -> Parse [from] to +find inner = Parse $ \(ctx, xs) -> case xs of + [] -> failure [fmt|Wanted to get the first sub-parser that succeeds, but there were no elements in the list, at {ctx & showContext}|] + (y : ys) -> runParse' (findNE' inner) (ctx, y :| ys) + +findNE' :: Parse from to -> Parse (NonEmpty from) to +findNE' inner = Parse $ \(ctx, xs) -> + xs + <&> (\x -> runParse' inner (ctx, x)) + & traverse1 + ( \case + Success a -> Left a + Failure e -> Right e + ) + & \case + Left a -> Success a + Right errs -> + errs + & zipIndex + <&> (\(idx, errs') -> nestedMultiError [fmt|{idx}|] errs') + & nestedMultiError [fmt|None of these sub-parsers succeeded|] + & singleton + & Failure + +findAll :: Parse from to -> Parse [from] [to] +findAll inner = Parse $ \(ctx, xs) -> + xs + <&> (\x -> runParse' inner (ctx, x)) + & partitionValidations + & \case + (_miss, []) -> + -- in this case we just arbitrarily forward the original context … + Success (ctx, []) + (_miss, (hitCtx, hit) : hits) -> Success (hitCtx, hit : (hits <&> snd)) + +fieldParser :: FieldParser from to -> Parse from to +fieldParser fp = Parse $ \(ctx, from) -> case Field.runFieldParser fp from of + Right a -> Success (ctx, a) + Left err -> Failure $ singleton (singleError err) + +zipNonEmpty :: NonEmpty a -> NonEmpty b -> NonEmpty (a, b) +zipNonEmpty (x :| xs) (y :| ys) = (x, y) :| zip xs ys + +zipIndex :: NonEmpty b -> NonEmpty (Natural, b) +zipIndex = zipNonEmpty (1 :| [2 :: Natural ..]) |