module TextViewport.Render.Segmentation where import Data.DList qualified as DL import Data.HashMap.Strict qualified as HM import Data.List (minimumBy) import Data.Ord (comparing) import Data.Text (Text) import Data.Text qualified as T import Data.Vector (Vector) import Data.Vector qualified as V import Text.Hyphenation qualified as H import TextViewport.Buffer.Item import TextViewport.Render.RenderedLine (RenderedLine(..)) -- TODO qualified applyStrategy :: SegmentStrategy -> Int -> Int -> Text -> Vector RenderedLine applyStrategy NoSegments width itemIx txt = let rawLines = T.splitOn "\n" txt chunks = map (T.take width) rawLines -- crop offsets = scanOffsetsWithNewlines chunks in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] (zip offsets chunks) ] applyStrategy FixedWidthSegments width itemIx txt = let rawLines = T.splitOn "\n" txt (dl, _) = foldl step (DL.empty, 0) rawLines step (acc, off0) line = let chunks = chunkFixed width line offsets = scanOffsetsFrom off0 chunks offNext = off0 + T.length line + 1 acc' = acc `DL.append` DL.fromList (zip offsets chunks) in (acc', offNext) allChunks = DL.toList dl in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] allChunks ] applyStrategy (HyphenateSegments dict cache0) width itemIx txt = let rawLines = T.splitOn "\n" txt -- fold over each physical line, accumulating: -- * all rendered (offset, chunk) pairs -- * updated hyphenation cache (unused for now) -- * running character offset across lines (dl, _cache1, _) = foldl segmentOneLine (DL.empty, cache0, 0) rawLines allChunks = DL.toList dl in V.fromList [ RenderedLine { rlText = chunk, rlItemIx = itemIx, rlLineIx = lineIx, rlCharStart = off } | (lineIx, (off, chunk)) <- zip [0..] allChunks ] where -- Segment a single physical line using TeX‑lite hyphenation --segmentOneLine -- :: ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int) -- -> Text -- -> ([(Int, Text)], HM.HashMap Text [(Text, Text)], Int) segmentOneLine (acc, cache, off0) line = let (chunks, cache1) = segmentWithHyphenationTeXLite dict width line cache offsets = scanOffsetsFrom off0 chunks offNext = off0 + T.length line + 1 acc' = acc `DL.append` DL.fromList (zip offsets chunks) in (acc', cache1, offNext) --segmentOneLine (acc, cache, off0) line = -- let chunks = segmentWithHyphenationTeXLite dict width line -- offsets = scanOffsetsFrom off0 chunks -- offNext = off0 + T.length line + 1 -- +1 for newline -- acc' = acc ++ zip offsets chunks -- in (acc', cache, offNext) -- | Compute running character offsets for a list of chunks. scanOffsetsWithNewlines :: [Text] -> [Int] scanOffsetsWithNewlines = go 0 where go !_ [] = [] go !o (l:ls) = let off = o o' = o + T.length l + 1 -- +1 for newline in off : go o' ls -- | Chunk a single line into fixed-width pieces. chunkFixed :: Int -> Text -> [Text] chunkFixed w t | w <= 0 = [] | T.null t = [""] | otherwise = let (h, rest) = T.splitAt w t in h : if T.null rest then [] else chunkFixed w rest -------------------------------------------------------------------------------- -- Hyphenation-aware segmenting (TeX-lite) -------------------------------------------------------------------------------- -- Compute offsets starting from a base offset scanOffsetsFrom :: Int -> [Text] -> [Int] scanOffsetsFrom start = go start where go !_ [] = [] go !o (t:ts) = o : go (o + T.length t) ts segmentWithHyphenationTeXLite :: H.Hyphenator -> Int -> Text -> HM.HashMap Text [(Text, Text)] -> ([Text], HM.HashMap Text [(Text, Text)]) segmentWithHyphenationTeXLite dict width txt cache0 = go cache0 (T.words txt) where go cache [] = ([], cache) go cache ws = case lineCandidates dict width cache ws of ([], cache1) -> let chunks = breakWordSafe width ws in (chunks, cache1) (cs, cache1) -> let (line, rest, _) = minimumBy (comparing (scoreCandidate width)) cs (more, cache2) = go cache1 rest in (line : more, cache2) -- | Lossless fallback: treat remaining words as one long text and -- chunk it into width-sized pieces. Never truncates, never drops text. breakWordSafe :: Int -> [Text] -> [Text] breakWordSafe width ws = chunk (T.unwords ws) where chunk t | T.null t = [] | T.length t <= width = [t] | otherwise = let (c, r) = T.splitAt width t in c : chunk r type Candidate = (Text, [Text], Bool) lineCandidates :: H.Hyphenator -> Int -> HM.HashMap Text [(Text, Text)] -> [Text] -> ([(Text, [Text], Bool)], HM.HashMap Text [(Text, Text)]) lineCandidates dict width cache0 ws0 = go [] [] cache0 ws0 where go _ acc cache [] = (acc, cache) go line acc cache (w:ws) = let space = if null line then "" else " " baseTxt = T.unwords line wholeTxt = baseTxt <> space <> w wholeLen = T.length wholeTxt acc1 = if wholeLen <= width && not (T.null wholeTxt) then (wholeTxt, ws, False) : acc else acc (hyphs, cache1) = case HM.lookup w cache of Just hs -> (hs, cache) Nothing -> let hs = hyphenateWord dict w in (hs, HM.insert w hs cache) hyphCands = [ (preTxt, suf : ws, True) | (pre, suf) <- hyphs , not (T.null pre) , let preTxt = baseTxt <> space <> pre <> "-" , T.length preTxt <= width ] acc2 = hyphCands ++ acc1 in if wholeLen <= width then go (line ++ [w]) acc2 cache1 ws else (acc2, cache1) hyphenateWord :: H.Hyphenator -> Text -> [(Text, Text)] hyphenateWord dict word = let parts = H.hyphenate dict (T.unpack word) in [ ( T.pack (concat (take i parts)) , T.pack (concat (drop i parts)) ) | i <- [1 .. length parts - 1] ] scoreCandidate :: Int -> Candidate -> Int scoreCandidate width (line, _, endsWithHyphen) = let len = T.length line remSpace = max 0 (width - len) badness = remSpace * remSpace * remSpace hyphenPenalty = if endsWithHyphen then 50 else 0 shortPenalty = if len < width `div` 2 then 200 else 0 in badness + hyphenPenalty + shortPenalty