summaryrefslogtreecommitdiffstats
path: root/src/TextViewport/Render/Segmentation.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/TextViewport/Render/Segmentation.hs')
-rw-r--r--src/TextViewport/Render/Segmentation.hs200
1 files changed, 200 insertions, 0 deletions
diff --git a/src/TextViewport/Render/Segmentation.hs b/src/TextViewport/Render/Segmentation.hs
new file mode 100644
index 0000000..584798e
--- /dev/null
+++ b/src/TextViewport/Render/Segmentation.hs
@@ -0,0 +1,200 @@
+module TextViewport.Render.Segmentation where
+
+import Data.DList qualified as DL
+import Data.HashMap.Strict qualified as HM
+import Data.List (minimumBy)
+import Data.Ord (comparing)
+import Data.Text (Text)
+import Data.Text qualified as T
+import Data.Vector (Vector)
+import Data.Vector qualified as V
+import Text.Hyphenation qualified as H
+import TextViewport.Buffer.Item
+import TextViewport.Render.RenderedLine (RenderedLine(..)) -- TODO qualified
+
+applyStrategy :: SegmentStrategy -> Int -> Int -> Text -> Vector RenderedLine
+
+applyStrategy NoSegments width itemIx txt =
+ let rawLines = T.splitOn "\n" txt
+ chunks = map (T.take width) rawLines -- crop
+ offsets = scanOffsetsWithNewlines chunks
+ in V.fromList
+ [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
+ | (lineIx, (off, chunk)) <- zip [0..] (zip offsets chunks)
+ ]
+
+applyStrategy FixedWidthSegments width itemIx txt =
+ let rawLines = T.splitOn "\n" txt
+ (dl, _) = foldl step (DL.empty, 0) rawLines
+ step (acc, off0) line =
+ let chunks = chunkFixed width line
+ offsets = scanOffsetsFrom off0 chunks
+ offNext = off0 + T.length line + 1
+ acc' = acc `DL.append` DL.fromList (zip offsets chunks)
+ in (acc', offNext)
+ allChunks = DL.toList dl
+ in V.fromList
+ [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
+ | (lineIx, (off, chunk)) <- zip [0..] allChunks
+ ]
+
+applyStrategy (HyphenateSegments dict cache0) width itemIx txt =
+ let rawLines = T.splitOn "\n" txt
+
+ -- fold over each physical line, accumulating:
+ -- * all rendered (offset, chunk) pairs
+ -- * updated hyphenation cache (unused for now)
+ -- * running character offset across lines
+ (dl, _cache1, _) =
+ foldl segmentOneLine (DL.empty, cache0, 0) rawLines
+ allChunks = DL.toList dl
+ in V.fromList
+ [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off }
+ | (lineIx, (off, chunk)) <- zip [0..] allChunks
+ ]
+ where
+ -- Segment a single physical line using TeX‑lite hyphenation
+ --segmentOneLine
+ -- :: ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int)
+ -- -> Text
+ -- -> ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int)
+ segmentOneLine (acc, cache, off0) line =
+ let (chunks, cache1) = segmentWithHyphenationTeXLite dict width line cache
+ offsets = scanOffsetsFrom off0 chunks
+ offNext = off0 + T.length line + 1
+ acc' = acc `DL.append` DL.fromList (zip offsets chunks)
+ in (acc', cache1, offNext)
+ --segmentOneLine (acc, cache, off0) line =
+ -- let chunks = segmentWithHyphenationTeXLite dict width line
+ -- offsets = scanOffsetsFrom off0 chunks
+ -- offNext = off0 + T.length line + 1 -- +1 for newline
+ -- acc' = acc ++ zip offsets chunks
+ -- in (acc', cache, offNext)
+
+-- | Compute running character offsets for a list of chunks.
+scanOffsetsWithNewlines :: [Text] -> [Int]
+scanOffsetsWithNewlines = go 0
+ where
+ go !_ [] = []
+ go !o (l:ls) =
+ let off = o
+ o' = o + T.length l + 1 -- +1 for newline
+ in off : go o' ls
+
+-- | Chunk a single line into fixed-width pieces.
+chunkFixed :: Int -> Text -> [Text]
+chunkFixed w t
+ | w <= 0 = []
+ | T.null t = [""]
+ | otherwise =
+ let (h, rest) = T.splitAt w t
+ in h : if T.null rest then [] else chunkFixed w rest
+
+--------------------------------------------------------------------------------
+-- Hyphenation-aware segmenting (TeX-lite)
+--------------------------------------------------------------------------------
+
+-- Compute offsets starting from a base offset
+scanOffsetsFrom :: Int -> [Text] -> [Int]
+scanOffsetsFrom start = go start
+ where
+ go !_ [] = []
+ go !o (t:ts) = o : go (o + T.length t) ts
+
+segmentWithHyphenationTeXLite
+ :: H.Hyphenator
+ -> Int
+ -> Text
+ -> HM.HashMap Text [(Text, Text)]
+ -> ([Text], HM.HashMap Text [(Text, Text)])
+segmentWithHyphenationTeXLite dict width txt cache0 =
+ go cache0 (T.words txt)
+ where
+ go cache [] = ([], cache)
+ go cache ws =
+ case lineCandidates dict width cache ws of
+ ([], cache1) ->
+ let chunks = breakWordSafe width ws
+ in (chunks, cache1)
+ (cs, cache1) ->
+ let (line, rest, _) =
+ minimumBy (comparing (scoreCandidate width)) cs
+ (more, cache2) = go cache1 rest
+ in (line : more, cache2)
+
+-- | Lossless fallback: treat remaining words as one long text and
+-- chunk it into width-sized pieces. Never truncates, never drops text.
+breakWordSafe :: Int -> [Text] -> [Text]
+breakWordSafe width ws =
+ chunk (T.unwords ws)
+ where
+ chunk t
+ | T.null t = []
+ | T.length t <= width = [t]
+ | otherwise =
+ let (c, r) = T.splitAt width t
+ in c : chunk r
+
+type Candidate = (Text, [Text], Bool)
+
+lineCandidates
+ :: H.Hyphenator
+ -> Int
+ -> HM.HashMap Text [(Text, Text)]
+ -> [Text]
+ -> ([(Text, [Text], Bool)], HM.HashMap Text [(Text, Text)])
+lineCandidates dict width cache0 ws0 =
+ go [] [] cache0 ws0
+ where
+ go _ acc cache [] = (acc, cache)
+ go line acc cache (w:ws) =
+ let space = if null line then "" else " "
+ baseTxt = T.unwords line
+ wholeTxt = baseTxt <> space <> w
+ wholeLen = T.length wholeTxt
+
+ acc1 =
+ if wholeLen <= width && not (T.null wholeTxt)
+ then (wholeTxt, ws, False) : acc
+ else acc
+
+ (hyphs, cache1) =
+ case HM.lookup w cache of
+ Just hs -> (hs, cache)
+ Nothing ->
+ let hs = hyphenateWord dict w
+ in (hs, HM.insert w hs cache)
+
+ hyphCands =
+ [ (preTxt, suf : ws, True)
+ | (pre, suf) <- hyphs
+ , not (T.null pre)
+ , let preTxt = baseTxt <> space <> pre <> "-"
+ , T.length preTxt <= width
+ ]
+
+ acc2 = hyphCands ++ acc1
+ in if wholeLen <= width
+ then go (line ++ [w]) acc2 cache1 ws
+ else (acc2, cache1)
+
+hyphenateWord :: H.Hyphenator -> Text -> [(Text, Text)]
+hyphenateWord dict word =
+ let parts = H.hyphenate dict (T.unpack word)
+ in [ ( T.pack (concat (take i parts))
+ , T.pack (concat (drop i parts))
+ )
+ | i <- [1 .. length parts - 1]
+ ]
+
+scoreCandidate :: Int -> Candidate -> Int
+scoreCandidate width (line, _, endsWithHyphen) =
+ let len = T.length line
+ remSpace = max 0 (width - len)
+ badness = remSpace * remSpace * remSpace
+ hyphenPenalty =
+ if endsWithHyphen then 50 else 0
+ shortPenalty =
+ if len < width `div` 2 then 200 else 0
+ in badness + hyphenPenalty + shortPenalty
+