users/aspen/xanthous/src/Xanthous/Generators/Speech.hs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE OverloadedLists #-}
--------------------------------------------------------------------------------
module Xanthous.Generators.Speech
  ( -- * Language definition
    Language(..)
    -- ** Lenses
  , phonotactics
  , syllablesPerWord

    -- ** Phonotactics
  , Phonotactics(..)
    -- *** Lenses
  , onsets
  , nuclei
  , codas
  , numOnsets
  , numNuclei
  , numCodas

    -- * Language generation
  , syllable
  , word

    -- * Languages
  , english
  , gormlak

  ) where
--------------------------------------------------------------------------------
import           Xanthous.Prelude hiding (replicateM)
import           Data.Interval (Interval, (<=..<=))
import qualified Data.Interval as Interval
import           Control.Monad.Random.Class (MonadRandom)
import           Xanthous.Random (chooseRange, choose, ChooseElement (..), Weighted (Weighted))
import           Control.Monad (replicateM)
import           Test.QuickCheck (Arbitrary, CoArbitrary, Function)
import           Test.QuickCheck.Instances.Text ()
import           Data.List.NonEmpty (NonEmpty)
--------------------------------------------------------------------------------

-- | A fragment of text used as a building block of a syllable (for example an
-- entry of '_onsets', '_nuclei' or '_codas').
--
-- The string-literal, 'Semigroup' and 'Monoid' behaviour is that of the
-- wrapped 'Text'.
newtype Phoneme
  = Phoneme Text
  deriving stock    (Show, Eq, Generic)
  deriving newtype  (IsString, Semigroup, Monoid, Arbitrary)
  deriving anyclass (NFData, CoArbitrary, Function)

-- | The phonotactics of a language: the restrictions on which phonemes may
-- appear in its syllables.
--
-- A syllable is made up of an onset, then a nucleus, then a coda. The nucleus
-- and coda taken together are the "rhyme" of the syllable.
data Phonotactics = Phonotactics
  { -- | Permissible onsets: the consonant clusters at the beginning of a
    -- syllable
    _onsets    :: [Phoneme]
    -- | Permissible nuclei: the vowel clusters in the middle of a syllable
  , _nuclei    :: [Phoneme]
    -- | Permissible codas: the consonant clusters at the end of a syllable
  , _codas     :: [Phoneme]
    -- | Range of the number of onsets allowed in a syllable
  , _numOnsets :: Interval Word
    -- | Range of the number of nuclei allowed in a syllable
  , _numNuclei :: Interval Word
    -- | Range of the number of codas allowed in a syllable
  , _numCodas  :: Interval Word
  }
  deriving stock    (Show, Eq, Generic)
  deriving anyclass (NFData)

makeLenses ''Phonotactics

-- | Generate one random syllable obeying the given 'Phonotactics'.
--
-- The onset, nucleus and coda are generated independently (in that order) and
-- concatenated. For each part a count is drawn from the matching range
-- ('numOnsets', 'numNuclei', 'numCodas'), and that many phonemes are drawn from
-- the matching list ('onsets', 'nuclei', 'codas').
syllable :: MonadRandom m => Phonotactics -> m Text
syllable phonotactics = do
  Phoneme onset   <- genPart numOnsets onsets
  Phoneme nucleus <- genPart numNuclei nuclei
  Phoneme coda    <- genPart numCodas codas
  pure $ onset <> nucleus <> coda
  where
    -- Build one part of the syllable from a count-range lens and a
    -- phoneme-list lens.
    genPart countLens choiceLens = do
      drawnCount <- chooseRange (phonotactics ^. countLens)
      -- No count drawn from the range means the part is generated zero times.
      let count = fromIntegral (fromMaybe 0 drawnCount)
      picks <- replicateM count
             . choose
             . ChooseElement
             $ phonotactics ^. choiceLens
      -- Each pick is optional; if combining them yields nothing, fall back
      -- to the empty phoneme.
      pure (fromMaybe mempty (mconcat picks))

-- | The definition of a language.
--
-- For now this carries just enough information to generate multi-syllabic
-- words; grammar-related information will likely be added in the future.
data Language = Language
  { -- | The rules used to generate each syllable of a word
    _phonotactics     :: Phonotactics
    -- | Weighted choice of how many syllables a generated word has
  , _syllablesPerWord :: Weighted Int NonEmpty Int
  }
  deriving stock    (Show, Eq, Generic)
  deriving anyclass (NFData)

makeLenses ''Language

-- | Generate a random word in the given 'Language'.
--
-- A syllable count is chosen from the language's 'syllablesPerWord', then that
-- many syllables are generated with 'syllable' and concatenated.
word :: MonadRandom m => Language -> m Text
word lang =
  choose (lang ^. syllablesPerWord) >>= \syllableCount ->
    mconcat <$> replicateM syllableCount (syllable (lang ^. phonotactics))

--------------------------------------------------------------------------------

-- | Phonotactics used for the 'english' language.
--
-- Clusters are written with plain Latin letters (e.g. @"sh"@, @"th"@,
-- @"tsh"@) rather than IPA symbols, since they end up verbatim in generated
-- words.
--
-- NOTE(review): several clusters are listed more than once (e.g. @"sp"@,
-- @"st"@, @"nth"@). Presumably that makes them proportionally more likely to
-- be picked - confirm whether this weighting is intended before deduplicating.
--
-- <https://en.wikipedia.org/wiki/English_phonology#Phonotactics>
englishPhonotactics :: Phonotactics
englishPhonotactics = Phonotactics
  { _onsets = [ "pl" , "bl" , "kl" , "gl" , "pr" , "br" , "tr" , "dr" , "kr"
              , "gr" , "tw" , "dw" , "gw" , "kw" , "pw"

              , "fl" , "sl" , {- "thl", -} "shl" {- , "vl" -}
                -- "g" is a plain ASCII letter here (not IPA U+0261), matching
                -- the other g-clusters in this list
              , "p", "b", "t", "d", "k", "g", "m", "n", "f", "v", "th", "s"
              , "z", "h", "l", "w"

              , "sp", "st", "sk"

              , "sm", "sn"

              , "sf", "sth"

              , "spl", "skl", "spr", "str", "skr", "skw", "sm", "sp", "st", "sk"
              ]
  , _nuclei = [ "a", "e", "i", "o", "u", "ur", "ar", "or", "ear", "are", "ure"
              , "oa", "ee", "oo", "ei", "ie", "oi", "ou"
              ]
  , _codas = [ "m", "n", "ng", "p", "t", "tsh", "k", "f", "sh", "s", "th", "x"
             , "v", "z", "zh", "l", "r", "w"

               -- l + stop/affricate, parallel to the r-row below
             , "lp", "lb", "lt", "ld", "ltsh", "ldsh", "lk"
             , "rp", "rb", "rt", "rd", "rtsh", "rdsh", "rk", "rg"
             , "lf", "lv", "lth", "ls", "lz", "lsh", "lth"
             , "rf", "rv", "rth", "rs", "rz", "rth"
             , "lm", "ln"
             , "rm", "rn", "rl"
             , "mp", "nt", "nd", "nth", "nsh", "nk"
             , "mf", "ms", "mth", "nf", "nth", "ns", "nz", "nth"
             , "ft", "sp", "st", "sk"
             , "fth"
             , "pt", "kt"
             , "pth", "ps", "th", "ts", "dth", "dz", "ks"
             , "lpt", "lps", "lfth", "lts", "lst", "lkt", "lks"
             , "rmth", "rpt", "rps", "rts", "rst", "rkt"
             , "mpt", "mps", "ndth", "nkt", "nks", "nkth"
             , "ksth", "kst"
             ]
    -- A syllable has an optional onset, exactly one nucleus and an optional
    -- coda
  , _numOnsets = 0 <=..<= 1
  , _numNuclei = Interval.singleton 1
  , _numCodas  = 0 <=..<= 1
  }

-- | A 'Language' built on 'englishPhonotactics'.
--
-- NOTE(review): the pairs are presumably @(weight, syllable count)@, which
-- would make one-syllable words the most common - confirm against 'Weighted'.
english :: Language
english = Language
  { _phonotactics     = englishPhonotactics
  , _syllablesPerWord = Weighted
      [ (20, 1)
      , (7,  2)
      , (2,  3)
      , (1,  4)
      ]
  }

-- | Phonotactics used for the 'gormlak' language.
--
-- Every syllable has exactly one onset, one nucleus and one coda.
gormlakPhonotactics :: Phonotactics
gormlakPhonotactics = Phonotactics
 { _onsets =
     [ "h", "l", "g", "b", "m", "n", "ng"
     , "gl", "bl", "fl"
     ]
 , _nuclei =
     [ "a", "o", "aa", "u" ]
 , _codas =
     [ "r", "l", "g", "m", "n"
     , "rl", "gl", "ml", "rm"
     , "n", "k"
     ]
 , _numOnsets = Interval.singleton 1
 , _numNuclei = Interval.singleton 1
 , _numCodas  = Interval.singleton 1
 }

-- | A 'Language' built on 'gormlakPhonotactics'.
--
-- NOTE(review): the pairs are presumably @(weight, syllable count)@ - confirm
-- against 'Weighted'.
gormlak :: Language
gormlak = Language
  { _phonotactics     = gormlakPhonotactics
  , _syllablesPerWord = Weighted
      [ (5, 2)
      , (5, 1)
      , (1, 3)
      ]
  }