about summary refs log tree commit diff
path: root/users/aspen/xanthous/src/Xanthous/Generators/Speech.hs
diff options
context:
space:
mode:
authorAspen Smith <grfn@gws.fyi>2024-02-12T03·00-0500
committerclbot <clbot@tvl.fyi>2024-02-14T19·37+0000
commit82ecd61f5c699cf3af6c4eadf47a1c52b1d696c6 (patch)
tree429c5e078528000591742ec3211bc768ae913a78 /users/aspen/xanthous/src/Xanthous/Generators/Speech.hs
parent0ba476a4266015f278f18d74094299de74a5a111 (diff)
chore(users): grfn -> aspen r/7511
Change-Id: I6c6847fac56f0a9a1a2209792e00a3aec5e672b9
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10809
Autosubmit: aspen <root@gws.fyi>
Reviewed-by: sterni <sternenseemann@systemli.org>
Tested-by: BuildkiteCI
Reviewed-by: lukegb <lukegb@tvl.fyi>
Diffstat (limited to 'users/aspen/xanthous/src/Xanthous/Generators/Speech.hs')
-rw-r--r--users/aspen/xanthous/src/Xanthous/Generators/Speech.hs181
1 files changed, 181 insertions, 0 deletions
diff --git a/users/aspen/xanthous/src/Xanthous/Generators/Speech.hs b/users/aspen/xanthous/src/Xanthous/Generators/Speech.hs
new file mode 100644
index 000000000000..8abc00b6a2fc
--- /dev/null
+++ b/users/aspen/xanthous/src/Xanthous/Generators/Speech.hs
@@ -0,0 +1,181 @@
+{-# LANGUAGE TemplateHaskell #-}
+{-# LANGUAGE OverloadedLists #-}
+--------------------------------------------------------------------------------
+module Xanthous.Generators.Speech
+  ( -- * Language definition
+    Language(..)
+    -- ** Lenses
+  , phonotactics
+  , syllablesPerWord
+
+    -- ** Phonotactics
+  , Phonotactics(..)
+    -- *** Lenses
+  , onsets
+  , nuclei
+  , codas
+  , numOnsets
+  , numNuclei
+  , numCodas
+
+    -- * Language generation
+  , syllable
+  , word
+
+    -- * Languages
+  , english
+  , gormlak
+
+  ) where
+--------------------------------------------------------------------------------
+import           Xanthous.Prelude hiding (replicateM)
+import           Data.Interval (Interval, (<=..<=))
+import qualified Data.Interval as Interval
+import           Control.Monad.Random.Class (MonadRandom)
+import           Xanthous.Random (chooseRange, choose, ChooseElement (..), Weighted (Weighted))
+import           Control.Monad (replicateM)
+import           Test.QuickCheck (Arbitrary, CoArbitrary, Function)
+import           Test.QuickCheck.Instances.Text ()
+import           Data.List.NonEmpty (NonEmpty)
+--------------------------------------------------------------------------------
+
+newtype Phoneme = Phoneme Text
+  deriving stock (Show, Eq, Generic)
+  deriving anyclass (NFData, CoArbitrary, Function)
+  deriving newtype (IsString, Semigroup, Monoid, Arbitrary)
+
+-- | The phonotactics of a language
+--
+-- The phonotactics of a language represent the restriction on the phonemes in
+-- the syllables of a language.
+--
+-- Syllables in a language consist of an onset, a nucleus, and a coda (the
+-- nucleus and the coda together representing the "rhyme" of the syllable).
+data Phonotactics = Phonotactics
+  { _onsets    :: [Phoneme] -- ^ The permissible onsets, or consonant clusters
+                           --   at the beginning of a syllable
+  , _nuclei    :: [Phoneme] -- ^ The permissible nuclei, or vowel clusters in
+                           --   the middle of a syllable
+  , _codas     :: [Phoneme] -- ^ The permissible codas, or consonant clusters at
+                           --   the end of a syllable
+  , _numOnsets :: Interval Word -- ^ The range of number of allowable onsets
+  , _numNuclei :: Interval Word -- ^ The range of number of allowable nuclei
+  , _numCodas  :: Interval Word -- ^ The range of number of allowable codas
+  }
+  deriving stock (Show, Eq, Generic)
+  deriving anyclass (NFData)
+makeLenses ''Phonotactics
+
+-- | Randomly generate a syllable with the given 'Phonotactics'
+syllable :: MonadRandom m => Phonotactics -> m Text
+syllable phonotactics = do
+  let genPart num choices = do
+        n <- fromIntegral . fromMaybe 0 <$> chooseRange (phonotactics ^. num)
+        fmap (fromMaybe mempty . mconcat)
+          . replicateM n
+          . choose . ChooseElement
+          $ phonotactics ^. choices
+
+  (Phoneme onset) <- genPart numOnsets onsets
+  (Phoneme nucleus) <- genPart numNuclei nuclei
+  (Phoneme coda) <- genPart numCodas codas
+
+  pure $ onset <> nucleus <> coda
+
+-- | A definition for a language
+--
+-- Currently this provides enough information to generate multi-syllabic words,
+-- but in the future will likely also include grammar-related things.
+data Language = Language
+  { _phonotactics :: Phonotactics
+  , _syllablesPerWord :: Weighted Int NonEmpty Int
+  }
+  deriving stock (Show, Eq, Generic)
+  deriving anyclass (NFData)
+makeLenses ''Language
+
+word :: MonadRandom m => Language -> m Text
+word lang = do
+  numSyllables <- choose $ lang ^. syllablesPerWord
+  mconcat <$> replicateM numSyllables (syllable $ lang ^. phonotactics)
+
+--------------------------------------------------------------------------------
+
+-- <https://en.wikipedia.org/wiki/English_phonology#Phonotactics>
+englishPhonotactics :: Phonotactics
+englishPhonotactics = Phonotactics
+  { _onsets = [ "pl" , "bl" , "kl" , "gl" , "pr" , "br" , "tr" , "dr" , "kr"
+              , "gr" , "tw" , "dw" , "gw" , "kw" , "pw"
+
+              , "fl" , "sl" , {- "thl", -} "shl" {- , "vl" -}
+              , "p", "b", "t", "d", "k", "ɡ", "m", "n", "f", "v", "th", "s"
+              , "z", "h", "l", "w"
+
+              , "sp", "st", "sk"
+
+              , "sm", "sn"
+
+              , "sf", "sth"
+
+              , "spl", "skl", "spr", "str", "skr", "skw", "sm", "sp", "st", "sk"
+              ]
+  , _nuclei = [ "a", "e", "i", "o", "u", "ur", "ar", "or", "ear", "are", "ure"
+              , "oa", "ee", "oo", "ei", "ie", "oi", "ou"
+              ]
+  , _codas = [ "m", "n", "ng", "p", "t", "tsh", "k", "f", "sh", "s", "th", "x"
+             , "v", "z", "zh", "l", "r", "w"
+
+             , "lk", "lb", "lt", "ld", "ltsh", "ldsh", "lk"
+             , "rp", "rb", "rt", "rd", "rtsh", "rdsh", "rk", "rɡ"
+             , "lf", "lv", "lth", "ls", "lz", "lsh", "lth"
+             , "rf", "rv", "rth", "rs", "rz", "rth"
+             , "lm", "ln"
+             , "rm", "rn", "rl"
+             , "mp", "nt", "nd", "nth", "nsh", "nk"
+             , "mf", "ms", "mth", "nf", "nth", "ns", "nz", "nth"
+             , "ft", "sp", "st", "sk"
+             , "fth"
+             , "pt", "kt"
+             , "pth", "ps", "th", "ts", "dth", "dz", "ks"
+             , "lpt", "lps", "lfth", "lts", "lst", "lkt", "lks"
+             , "rmth", "rpt", "rps", "rts", "rst", "rkt"
+             , "mpt", "mps", "ndth", "nkt", "nks", "nkth"
+             , "ksth", "kst"
+             ]
+  , _numOnsets = 0 <=..<= 1
+  , _numNuclei = Interval.singleton 1
+  , _numCodas  = 0 <=..<= 1
+  }
+
+english :: Language
+english = Language
+  { _phonotactics = englishPhonotactics
+  , _syllablesPerWord = Weighted [(20, 1),
+                                  (7,  2),
+                                  (2,  3),
+                                  (1,  4)]
+  }
+
+gormlakPhonotactics :: Phonotactics
+gormlakPhonotactics = Phonotactics
+ { _onsets = [ "h", "l", "g", "b", "m", "n", "ng"
+             , "gl", "bl", "fl"
+             ]
+ , _numOnsets = Interval.singleton 1
+ , _nuclei = [ "a", "o", "aa", "u" ]
+ , _numNuclei = Interval.singleton 1
+ , _codas = [ "r", "l", "g", "m", "n"
+            , "rl", "gl", "ml", "rm"
+            , "n", "k"
+            ]
+ , _numCodas = Interval.singleton 1
+ }
+
+gormlak :: Language
+gormlak = Language
+  { _phonotactics = gormlakPhonotactics
+  , _syllablesPerWord = Weighted [ (5, 2)
+                                 , (5, 1)
+                                 , (1, 3)
+                                 ]
+  }