summaryrefslogtreecommitdiffstats
path: root/src/TextViewport/Render/Segmentation.hs
blob: 584798ec1734d078a298d7621ee7edc1bbe0f309 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
module TextViewport.Render.Segmentation where

import Data.DList qualified as DL
import Data.HashMap.Strict qualified as HM
import Data.List (minimumBy)
import Data.Ord (comparing)
import Data.Text (Text)
import Data.Text qualified as T
import Data.Vector (Vector)
import Data.Vector qualified as V
import Text.Hyphenation qualified as H
import TextViewport.Buffer.Item
import TextViewport.Render.RenderedLine (RenderedLine(..)) -- TODO qualified

applyStrategy :: SegmentStrategy -> Int -> Int -> Text -> Vector RenderedLine

applyStrategy NoSegments width itemIx txt =
  let rawLines = T.splitOn "\n" txt
      chunks  = map (T.take width) rawLines -- crop
      offsets = scanOffsetsWithNewlines chunks
  in V.fromList
       [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
       | (lineIx, (off, chunk)) <- zip [0..] (zip offsets chunks)
       ]

applyStrategy FixedWidthSegments width itemIx txt =
  let rawLines = T.splitOn "\n" txt
      (dl, _) = foldl step (DL.empty, 0) rawLines
      step (acc, off0) line =
        let chunks   = chunkFixed width line
            offsets  = scanOffsetsFrom off0 chunks
            offNext  = off0 + T.length line + 1
            acc'     = acc `DL.append` DL.fromList (zip offsets chunks)
        in (acc', offNext)
      allChunks = DL.toList dl
  in V.fromList
       [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
       | (lineIx, (off, chunk)) <- zip [0..] allChunks
       ]

applyStrategy (HyphenateSegments dict cache0) width itemIx txt =
  let rawLines = T.splitOn "\n" txt

      -- fold over each physical line, accumulating:
      --   * all rendered (offset, chunk) pairs
      --   * updated hyphenation cache (unused for now)
      --   * running character offset across lines
      (dl, _cache1, _) =
        foldl segmentOneLine (DL.empty, cache0, 0) rawLines
      allChunks = DL.toList dl
  in V.fromList
       [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
       | (lineIx, (off, chunk)) <- zip [0..] allChunks
       ]
  where
    -- Segment a single physical line using TeX‑lite hyphenation
    --segmentOneLine
    --  :: ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int)
    --  -> Text
    --  -> ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int)
    segmentOneLine (acc, cache, off0) line =
      let (chunks, cache1) = segmentWithHyphenationTeXLite dict width line cache
          offsets          = scanOffsetsFrom off0 chunks
          offNext          = off0 + T.length line + 1
          acc'             = acc `DL.append` DL.fromList (zip offsets chunks)
      in (acc', cache1, offNext)
    --segmentOneLine (acc, cache, off0) line =
    --  let chunks  = segmentWithHyphenationTeXLite dict width line
    --      offsets = scanOffsetsFrom off0 chunks
    --      offNext = off0 + T.length line + 1   -- +1 for newline
    --      acc'    = acc ++ zip offsets chunks
    --  in (acc', cache, offNext)

-- | Compute running character offsets for a list of chunks.
scanOffsetsWithNewlines :: [Text] -> [Int]
scanOffsetsWithNewlines = go 0
  where
    go !_ []     = []
    go !o (l:ls) =
      let off = o
          o'  = o + T.length l + 1   -- +1 for newline
      in off : go o' ls

-- | Chunk a single line into fixed-width pieces.
chunkFixed :: Int -> Text -> [Text]
chunkFixed w t
  | w <= 0    = []
  | T.null t  = [""]
  | otherwise =
      let (h, rest) = T.splitAt w t
      in h : if T.null rest then [] else chunkFixed w rest

--------------------------------------------------------------------------------
-- Hyphenation-aware segmenting (TeX-lite)
--------------------------------------------------------------------------------

-- Compute offsets starting from a base offset
scanOffsetsFrom :: Int -> [Text] -> [Int]
scanOffsetsFrom start = go start
  where
    go !_ []     = []
    go !o (t:ts) = o : go (o + T.length t) ts

segmentWithHyphenationTeXLite
  :: H.Hyphenator
  -> Int
  -> Text
  -> HM.HashMap Text [(Text, Text)]
  -> ([Text], HM.HashMap Text [(Text, Text)])
segmentWithHyphenationTeXLite dict width txt cache0 =
    go cache0 (T.words txt)
  where
    go cache [] = ([], cache)
    go cache ws =
      case lineCandidates dict width cache ws of
        ([], cache1) ->
          let chunks = breakWordSafe width ws
          in (chunks, cache1)
        (cs, cache1) ->
          let (line, rest, _) =
                minimumBy (comparing (scoreCandidate width)) cs
              (more, cache2) = go cache1 rest
          in (line : more, cache2)

-- | Lossless fallback: treat remaining words as one long text and
--   chunk it into width-sized pieces. Never truncates, never drops text.
breakWordSafe :: Int -> [Text] -> [Text]
breakWordSafe width ws =
  chunk (T.unwords ws)
  where
    chunk t
      | T.null t            = []
      | T.length t <= width = [t]
      | otherwise =
          let (c, r) = T.splitAt width t
          in c : chunk r

type Candidate = (Text, [Text], Bool)

lineCandidates
  :: H.Hyphenator
  -> Int
  -> HM.HashMap Text [(Text, Text)]
  -> [Text]
  -> ([(Text, [Text], Bool)], HM.HashMap Text [(Text, Text)])
lineCandidates dict width cache0 ws0 =
  go [] [] cache0 ws0
  where
    go _    acc cache [] = (acc, cache)
    go line acc cache (w:ws) =
      let space   = if null line then "" else " "
          baseTxt = T.unwords line
          wholeTxt = baseTxt <> space <> w
          wholeLen = T.length wholeTxt

          acc1 =
            if wholeLen <= width && not (T.null wholeTxt)
              then (wholeTxt, ws, False) : acc
              else acc

          (hyphs, cache1) =
            case HM.lookup w cache of
              Just hs -> (hs, cache)
              Nothing ->
                let hs = hyphenateWord dict w
                in (hs, HM.insert w hs cache)

          hyphCands =
            [ (preTxt, suf : ws, True)
            | (pre, suf) <- hyphs
            , not (T.null pre)
            , let preTxt = baseTxt <> space <> pre <> "-"
            , T.length preTxt <= width
            ]

          acc2 = hyphCands ++ acc1
      in if wholeLen <= width
           then go (line ++ [w]) acc2 cache1 ws
           else (acc2, cache1)

hyphenateWord :: H.Hyphenator -> Text -> [(Text, Text)]
hyphenateWord dict word =
    let parts = H.hyphenate dict (T.unpack word)
    in [ ( T.pack (concat (take i parts))
         , T.pack (concat (drop i parts))
         )
       | i <- [1 .. length parts - 1]
       ]

scoreCandidate :: Int -> Candidate -> Int
scoreCandidate width (line, _, endsWithHyphen) =
    let len      = T.length line
        remSpace = max 0 (width - len)
        badness  = remSpace * remSpace * remSpace
        hyphenPenalty =
          if endsWithHyphen then 50 else 0
        shortPenalty =
          if len < width `div` 2 then 200 else 0
    in badness + hyphenPenalty + shortPenalty