module Main (main) where

import System.IO (readFile, writeFile)
import qualified Data.ByteString as L
-- import qualified Data.ByteString.UTF8 as L8
-- import Control.Monad.State
import qualified Data.Map as M
import Data.List
import Data.Word

import Unicode (word32to8le, utf8to16'le)


data Tree = Node {
                tValue  :: [L.ByteString], {- of UTF-16 ! -}
                tLeaves :: M.Map Char Tree
            }
            deriving (Show)



{-
utf8(无)=[0xE6,0x97,0xA0] , the standard Library:
#################################################
#     Data.Char.isSpace '\xA0' == True !!!!!!   #
#################################################
so the Prelude.words function will break the string between 0x97 and 0xA0
is it a BUG?
so i write my own
-}

words' :: String -> [String]
words' s = case dropWhile isSpace' s of
                [] -> []
                s' -> let (w, s'') = break isSpace' s'
                      in w : words' s''
        where isSpace' '\t' = True
              isSpace' ' '  = True
              isSpace' '\n' = True
              isSpace' '\r' = True
              isSpace'  _   = False

filterUTF8BOM :: String -> String
filterUTF8BOM ('\xEF':'\xBB':'\xBF':s) = s
filterUTF8BOM s = s

main = readFile "table.txt" >>=
       L.writeFile "table.db" . parseTree . buildTree
                              . map paar . breakText . filterUTF8BOM
       where -- by making tuples , translating utf8 into utf16
             {- i don't have ByteString.UTF8 in my Windows installed :( -}
             {- paar (a:b:_) = (a, utf8to16le $ L8.fromString b) -}
             paar (a:b:_) = (a, L.pack $ utf8to16'le $ map (toEnum.fromEnum) b)
             -- break Text into `[[key, val, freq]]`
             breakText = filter (\x-> length x >= 2) . map words' . lines


buildTree :: [(String, L.ByteString)] -> Tree
buildTree table = foldl' insertNode root table
          where root = Node [] M.empty


insertNode :: Tree -> (String, L.ByteString) -> Tree
{- ein neues Ergebnis hinzufügen , O(1), aber invers: -}
insertNode root ([], val) = root { tValue = val : tValue root }
insertNode root ((k:ks), val) =
        root { tLeaves = M.alter f k $ tLeaves root }
        where emptyNode = Node [] M.empty
              -- insert or update a Node
              f Nothing     = Just $ insertNode emptyNode (ks, val)
              f (Just leaf) = Just $ insertNode leaf (ks, val)


parseTree :: Tree -> L.ByteString
parseTree root = flip L.append leaves $ fromWord32 $
                               4 + (toEnum $ L.length leaves)
        where leaves = M.foldlWithKey foldLeaves L.empty $ tLeaves root
              foldLeaves t k v = L.append t $ parseTree' k v

{- did not use StateT + WriterMonad, because it's slower and the
 - output String in WriterMonad can not be directly manipulated -}
{- Since L.length is O(n), so do not need to
 - record the length of ByteString manuell -}

parseTree' :: Char -> Tree -> L.ByteString
parseTree' k root = foldl' L.append L.empty $ binNode
    where binNode = [
                        fromWord32 totalSize,
                        fromChar k,
                        fromWord32 strSize, --
                        strings, --
                        leaves
                    ]
          totalSize = 4 + 2 + strSize+leafSize  {- 4 : Word32, 2 : WChar (k) -}
          strings = foldl' foldStrs L.empty $ tValue root
          strSize = 4 + (toEnum $ L.length strings)
          leaves = M.foldlWithKey foldLeaves L.empty $ tLeaves root
          -- we do not record the `leafSize` field in the final binary Data
          -- so do not +4 to leafSize it self !!
          leafSize = toEnum $ L.length leaves
          {- foldStrs :: L.ByteString -> L.ByteString -> Word32 -}
          foldStrs a x = L.append (L.append x $ L.pack [0,0]) a
          {- foldStrs :: Char -> L.ByteString -> L.ByteString -> L.ByteString -}
          foldLeaves t k v = L.append t $ parseTree' k v


fromWord32 :: Word32 -> L.ByteString
fromWord32 = L.pack . word32to8le

{- convert Char in WChar -}
fromChar :: Char -> L.ByteString
fromChar x = L.pack [toEnum $ fromEnum x, 0]
