diff options
Diffstat (limited to 'src/TextViewport/Render/Segmentation.hs')
| -rw-r--r-- | src/TextViewport/Render/Segmentation.hs | 200 |
1 files changed, 200 insertions, 0 deletions
diff --git a/src/TextViewport/Render/Segmentation.hs b/src/TextViewport/Render/Segmentation.hs new file mode 100644 index 0000000..584798e --- /dev/null +++ b/src/TextViewport/Render/Segmentation.hs @@ -0,0 +1,200 @@ +module TextViewport.Render.Segmentation where + +import Data.DList qualified as DL +import Data.HashMap.Strict qualified as HM +import Data.List (minimumBy) +import Data.Ord (comparing) +import Data.Text (Text) +import Data.Text qualified as T +import Data.Vector (Vector) +import Data.Vector qualified as V +import Text.Hyphenation qualified as H +import TextViewport.Buffer.Item +import TextViewport.Render.RenderedLine (RenderedLine(..)) -- TODO qualified + +applyStrategy :: SegmentStrategy -> Int -> Int -> Text -> Vector RenderedLine + +applyStrategy NoSegments width itemIx txt = + let rawLines = T.splitOn "\n" txt + chunks = map (T.take width) rawLines -- crop + offsets = scanOffsetsWithNewlines chunks + in V.fromList + [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } + | (lineIx, (off, chunk)) <- zip [0..] (zip offsets chunks) + ] + +applyStrategy FixedWidthSegments width itemIx txt = + let rawLines = T.splitOn "\n" txt + (dl, _) = foldl step (DL.empty, 0) rawLines + step (acc, off0) line = + let chunks = chunkFixed width line + offsets = scanOffsetsFrom off0 chunks + offNext = off0 + T.length line + 1 + acc' = acc `DL.append` DL.fromList (zip offsets chunks) + in (acc', offNext) + allChunks = DL.toList dl + in V.fromList + [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } + | (lineIx, (off, chunk)) <- zip [0..] allChunks + ] + +applyStrategy (HyphenateSegments dict cache0) width itemIx txt = + let rawLines = T.splitOn "\n" txt + + -- fold over each physical line, accumulating: + -- * all rendered (offset, chunk) pairs + -- * updated hyphenation cache (unused for now) + -- * running character offset across lines + (dl, _cache1, _) = + foldl segmentOneLine (DL.empty, cache0, 0) rawLines + allChunks = DL.toList dl + in V.fromList + [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } + | (lineIx, (off, chunk)) <- zip [0..] allChunks + ] + where + -- Segment a single physical line using TeX‑lite hyphenation + --segmentOneLine + -- :: ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int) + -- -> Text + -- -> ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int) + segmentOneLine (acc, cache, off0) line = + let (chunks, cache1) = segmentWithHyphenationTeXLite dict width line cache + offsets = scanOffsetsFrom off0 chunks + offNext = off0 + T.length line + 1 + acc' = acc `DL.append` DL.fromList (zip offsets chunks) + in (acc', cache1, offNext) + --segmentOneLine (acc, cache, off0) line = + -- let chunks = segmentWithHyphenationTeXLite dict width line + -- offsets = scanOffsetsFrom off0 chunks + -- offNext = off0 + T.length line + 1 -- +1 for newline + -- acc' = acc ++ zip offsets chunks + -- in (acc', cache, offNext) + +-- | Compute running character offsets for a list of chunks. +scanOffsetsWithNewlines :: [Text] -> [Int] +scanOffsetsWithNewlines = go 0 + where + go !_ [] = [] + go !o (l:ls) = + let off = o + o' = o + T.length l + 1 -- +1 for newline + in off : go o' ls + +-- | Chunk a single line into fixed-width pieces. +chunkFixed :: Int -> Text -> [Text] +chunkFixed w t + | w <= 0 = [] + | T.null t = [""] + | otherwise = + let (h, rest) = T.splitAt w t + in h : if T.null rest then [] else chunkFixed w rest + +-------------------------------------------------------------------------------- +-- Hyphenation-aware segmenting (TeX-lite) +-------------------------------------------------------------------------------- + +-- Compute offsets starting from a base offset +scanOffsetsFrom :: Int -> [Text] -> [Int] +scanOffsetsFrom start = go start + where + go !_ [] = [] + go !o (t:ts) = o : go (o + T.length t) ts + +segmentWithHyphenationTeXLite + :: H.Hyphenator + -> Int + -> Text + -> HM.HashMap Text [(Text, Text)] + -> ([Text], HM.HashMap Text [(Text, Text)]) +segmentWithHyphenationTeXLite dict width txt cache0 = + go cache0 (T.words txt) + where + go cache [] = ([], cache) + go cache ws = + case lineCandidates dict width cache ws of + ([], cache1) -> + let chunks = breakWordSafe width ws + in (chunks, cache1) + (cs, cache1) -> + let (line, rest, _) = + minimumBy (comparing (scoreCandidate width)) cs + (more, cache2) = go cache1 rest + in (line : more, cache2) + +-- | Lossless fallback: treat remaining words as one long text and +-- chunk it into width-sized pieces. Never truncates, never drops text. +breakWordSafe :: Int -> [Text] -> [Text] +breakWordSafe width ws = + chunk (T.unwords ws) + where + chunk t + | T.null t = [] + | T.length t <= width = [t] + | otherwise = + let (c, r) = T.splitAt width t + in c : chunk r + +type Candidate = (Text, [Text], Bool) + +lineCandidates + :: H.Hyphenator + -> Int + -> HM.HashMap Text [(Text, Text)] + -> [Text] + -> ([(Text, [Text], Bool)], HM.HashMap Text [(Text, Text)]) +lineCandidates dict width cache0 ws0 = + go [] [] cache0 ws0 + where + go _ acc cache [] = (acc, cache) + go line acc cache (w:ws) = + let space = if null line then "" else " " + baseTxt = T.unwords line + wholeTxt = baseTxt <> space <> w + wholeLen = T.length wholeTxt + + acc1 = + if wholeLen <= width && not (T.null wholeTxt) + then (wholeTxt, ws, False) : acc + else acc + + (hyphs, cache1) = + case HM.lookup w cache of + Just hs -> (hs, cache) + Nothing -> + let hs = hyphenateWord dict w + in (hs, HM.insert w hs cache) + + hyphCands = + [ (preTxt, suf : ws, True) + | (pre, suf) <- hyphs + , not (T.null pre) + , let preTxt = baseTxt <> space <> pre <> "-" + , T.length preTxt <= width + ] + + acc2 = hyphCands ++ acc1 + in if wholeLen <= width + then go (line ++ [w]) acc2 cache1 ws + else (acc2, cache1) + +hyphenateWord :: H.Hyphenator -> Text -> [(Text, Text)] +hyphenateWord dict word = + let parts = H.hyphenate dict (T.unpack word) + in [ ( T.pack (concat (take i parts)) + , T.pack (concat (drop i parts)) + ) + | i <- [1 .. length parts - 1] + ] + +scoreCandidate :: Int -> Candidate -> Int +scoreCandidate width (line, _, endsWithHyphen) = + let len = T.length line + remSpace = max 0 (width - len) + badness = remSpace * remSpace * remSpace + hyphenPenalty = + if endsWithHyphen then 50 else 0 + shortPenalty = + if len < width `div` 2 then 200 else 0 + in badness + hyphenPenalty + shortPenalty + |
