Darcs inventory parser: Add hash types and parse previous inventory

2016-05-16 14:02:43 +00:00 · 2016-05-16 14:02:43 +00:00 · 9ba6761459
commit 9ba6761459
parent f8ae122da7
4 changed files with 164 additions and 93 deletions
--- a/src/Darcs/Local/PatchInfo/Parser.hs
+++ b/src/Darcs/Local/PatchInfo/Parser.hs
@ -27,12 +27,12 @@ module Darcs.Local.PatchInfo.Parser
    )
 where

-import Prelude hiding (takeWhile)
+import Prelude hiding (take, takeWhile)

-import Control.Applicative (many)
-import Control.Monad (replicateM, replicateM_)
+import Control.Applicative (many, optional, liftA2)
+import Control.Arrow (second)
+import Control.Monad (replicateM_)
 import Crypto.Hash
-import Crypto.Hash.Algorithms (SHA1 (SHA1))
 import Data.Attoparsec.ByteString
 import Data.ByteArray (convert)
 import Data.ByteString (ByteString)
@ -43,7 +43,8 @@ import System.FilePath ((</>))

 import qualified Data.ByteString as B
 import qualified Data.ByteString.Char8 as BC
-import qualified Storage.Hashed.Hash as H
+import qualified Data.ByteString.Base16 as B16
+import qualified Data.ByteString.Lex.Integral as BX

 import Control.Applicative.Local
 import Darcs.Local.PatchInfo.Types
@ -51,15 +52,22 @@ import Data.Attoparsec.ByteString.Local
 import Data.ByteString.Local (stripPrefix)
 import Data.Text.UTF8.Local (decodeStrict)

-lf, space, star, dash, zero, nine, sqrOpen, sqrClose :: Word8
+lf       :: Word8
 lf       = 10
+space    :: Word8
 space    = 32
+star     :: Word8
 star     = 42
+dash     :: Word8
 dash     = 45
+zero     :: Word8
 zero     = 48
+nine     :: Word8
 nine     = 57
+sqrOpen  :: Word8
 sqrOpen  = 91
-sqrClose = 93
+--sqrClose :: Word8
+--sqrClose = 93

 digit :: Parser Word8
 digit = satisfy $ \ w -> zero <= w && w <= nine
@ -87,67 +95,92 @@ patchTimeP = do
    month <- decimal2P
    day <- decimal2P

-    hour <- decimal2P
-    minute <- decimal2P
-    second <- decimal2P
+    hours <- decimal2P
+    minutes <- decimal2P
+    seconds <- decimal2P

    case fromGregorianValid year month day of
        Nothing -> fail "Invalid patch date"
        Just uday -> return UTCTime
            { utctDay     = uday
            , utctDayTime =
-                secondsToDiffTime $ 3600 * hour + 60 * minute + second
+                secondsToDiffTime $ 3600 * hours + 60 * minutes + seconds
            }

+line :: Parser ByteString
+line = restOfLine
+
+restOfLine :: Parser ByteString
+restOfLine = takeWhile (/= lf)
+
+eol :: Parser ()
+eol = skip (== lf)
+
 skipLine :: Parser ()
-skipLine = do
-    skipWhile (/= lf)
-    skip (== lf)
+skipLine = skipWhile (/= lf)

 skipRestOfLine :: Parser ()
 skipRestOfLine = skipLine

 skipPatchP :: Parser ()
-skipPatchP = do
+skipPatchP =
    -- title
-    skipLine
+    skipLine *> eol *>
    -- author, inverted, time
-    skipLine
+    skipLine *> eol *>
    -- ignore, description
-    skipMany $ do
-        skip (== space)
-        skipRestOfLine
+    (skipMany $ skip (== space) *> skipRestOfLine *> eol) *>
    -- end of info
-    string "] \n"
+    (string "] \n") *>
    -- hash
-    skipWhile (/= lf)
+    skipLine

-pristineP :: Parser (Maybe Int, H.Hash)
-pristineP = do
-    string "pristine:"
-    (,) Nothing . H.decodeBase16 <$> takeWhile (/= lf)
+sha256P :: Parser ByteString
+sha256P = do
+    bs <- take 64
+    case second B.null $ B16.decode bs of
+        (h, True) -> return h
+        _         -> fail "SHA256 decoding from hex failed"
+
+sizeP :: Parser Int
+sizeP = do
+    bs <- take 10
+    case second B.null <$> BX.readDecimal bs of
+        Just (n, True) -> return n
+        _              -> fail "sizeP failed"
+
+sizeSha256P :: Parser (Int, ByteString)
+sizeSha256P = liftA2 (,) sizeP (skip (== dash) *> sha256P)
+
+pristineP :: Parser PristineHash
+pristineP = string "pristine:" *> (PristineHash <$> sha256P)
+
+prevInvP :: Parser InventoryHash
+prevInvP =
+    string "Starting with inventory" *> eol *>
+    (uncurry InventoryHash <$> sizeSha256P)

 patchInfoRawP :: Parser PatchInfoRaw
 patchInfoRawP = do
    word8 sqrOpen
    title <- takeWhile1 (/= lf)
-    word8 lf
+    eol

    author <- takeWhile1 (/= star)
    word8 star
    inverted <- (/= star) <$> (satisfy $ \ c -> c == star || c == dash)
    (timeRaw, time) <- match patchTimeP
-    word8 lf
+    eol

    word8 space
    junkp <- string "Ignore-this: "
    junkc <- takeWhile1 (/= lf)
-    word8 lf
+    eol

-    lines <- many $ word8 space *> takeWhile (/= lf) <* word8 lf
+    lines <- many $ word8 space *> takeWhile (/= lf) <* eol
    string "] \nhash: "

-    hash <- takeWhile1 (/= lf)
+    hash <- sizeSha256P

    return PatchInfoRaw
        { pirAuthor      = author
@ -185,7 +218,7 @@ refinePatchInfo pir =
            l -> Just $ BC.unlines l
    in  PatchInfo
            { piAuthor      = decodeStrict $ pirAuthor pir
-            , piHash        = pirHash pir
+            , piHash        = uncurry ContentHash $ pirHash pir
            , piTitle       = decodeStrict title
            , piDescription = decodeStrict <$> description
            , piTag         = tag
@ -194,71 +227,54 @@ refinePatchInfo pir =

 -- | Parse patch metadata and compute the metadata's hash, which can be used as
 -- a patch identifier for lookup and matching.
-patchInfoP :: Parser (PatchInfo, ByteString)
+patchInfoP :: Parser (PatchInfo, PatchHash)
 patchInfoP = do
    pir <- patchInfoRawP
-    return (refinePatchInfo pir, convert $ hashPatchInfo SHA1 pir)
-
-- NEXT current plan:
--
-- (0) all parses are incremental!!!
-- (1) use CountP below to determine number of patches
-- (2) given pagination details determine offset and limit
-- (3) parse the patches we want
-- (4) reverse their order...
-- (5) generate a whamlet widget from that
-- (*) FOR NOW no pagination, just read the entire thing once...
+    return (refinePatchInfo pir, PatchHash $ convert $ hashPatchInfo SHA1 pir)

 patchInfosCountP :: Parser Int
 patchInfosCountP = do
-    skipWhile (/= lf) -- pristine hash
-    n <- length <$> (many $ word8 lf >> skipPatchP)
-    word8 lf
+    -- pristine hash
+    skipLine
+    -- previous inventory
+    optional $
+        eol *> string "Starting" *> skipRestOfLine *> eol *>
+        skipLine
+    -- patch info
+    n <- length <$> (many $ eol *> skipPatchP)
+    eol
    return n

 patchInfosAllP :: Parser PatchSeq
-patchInfosAllP = do
-    (psize, phash) <- pristineP
-    ps <- many $ word8 lf >> patchInfoP
-    word8 lf
-    return PatchSeq
-        { psPristineHash = phash
-        , psPristineSize = psize
-        , psPatches      = ps
-        }
+patchInfosAllP = PatchSeq
+    <$> pristineP
+    <*> optional (eol *> prevInvP)
+    <*> many (eol *> patchInfoP)
+    <*  eol

 patchInfosOffsetP :: Int -> Parser PatchSeq
-patchInfosOffsetP off = do
-    (psize, phash) <- pristineP
-    replicateM_ off $ word8 lf >> skipPatchP
-    ps <- many $ word8 lf >> patchInfoP
-    word8 lf
-    return PatchSeq
-        { psPristineHash = phash
-        , psPristineSize = psize
-        , psPatches      = ps
-        }
+patchInfosOffsetP off = PatchSeq
+    <$> pristineP
+    <*> optional (eol *> prevInvP)
+    <*> ( replicateM_ off (eol *> skipPatchP) *>
+          many (eol *> patchInfoP)
+        )
+    <* eol
+

 patchInfosLimitP :: Int -> Parser PatchSeq
-patchInfosLimitP lim = do
-    (psize, phash) <- pristineP
-    ps <- atMost lim $ word8 lf >> patchInfoP
-    return PatchSeq
-        { psPristineHash = phash
-        , psPristineSize = psize
-        , psPatches      = ps
-        }
+patchInfosLimitP lim = PatchSeq
+    <$> pristineP
+    <*> optional (eol *> prevInvP)
+    <*> atMost lim (eol *> patchInfoP)

 patchInfosOffsetLimitP :: Int -> Int -> Parser PatchSeq
-patchInfosOffsetLimitP off lim = do
-    (psize, phash) <- pristineP
-    replicateM_ off $ word8 lf >> skipPatchP
-    ps <- atMost lim $ word8 lf >> patchInfoP
-    return PatchSeq
-        { psPristineHash = phash
-        , psPristineSize = psize
-        , psPatches      = ps
-        }
+patchInfosOffsetLimitP off lim = PatchSeq
+    <$> pristineP
+    <*> optional (eol *> prevInvP)
+    <*> ( replicateM_ off (eol *> skipPatchP) *>
+          atMost lim (eol *> patchInfoP)
+        )

 darcsDir :: FilePath
 darcsDir = "_darcs"
--- a/src/Darcs/Local/PatchInfo/Types.hs
+++ b/src/Darcs/Local/PatchInfo/Types.hs
@ -15,23 +15,54 @@

 module Darcs.Local.PatchInfo.Types
    ( PatchInfoRaw (..)
+    , PatchHash (..)
+    , ContentHash (..)
+    , InventoryHash (..)
+    , PristineHash (..)
    , PatchInfo (..)
    , PatchSeq (..)
    )
 where

+-- TODO
+--
+-- Apparently, after a while, some of the patches are moved from
+-- hashed_inventory into the inventories/ dir. So the patch set contains more
+-- than one group. This means I need to extend my parser to cover this case.
+-- Sources for info about this thing:
+--
+-- * Darcs source code
+-- * Darcs wiki
+-- * Local Darcs repos I have
+--
+-- From Darcs source code:
+--
+-- > The patches in a repository are stored in chunks broken up at \"clean\"
+-- > tags. A tag is clean if the only patches before it in the current
+-- > repository ordering are ones that the tag depends on (either directly
+-- > or indirectly). Each chunk is stored in a separate inventory file on disk.
+-- >
+-- > A 'PatchSet' represents a repo's history as the list of patches since the
+-- > last clean tag, and then a list of patch lists each delimited by clean tags.
+-- >
+-- > A 'Tagged' is a single chunk of a 'PatchSet'.  It has a 'PatchInfo'
+-- > representing a clean tag, the hash of the previous inventory (if it exists),
+-- > and the list of patches since that previous inventory.
+--
+-- Let's start with finding out the format of the binary inventories and
+-- parsing them.
+
 import Prelude

 import Data.ByteString (ByteString)
 import Data.Text (Text)
 import Data.Time.Clock (UTCTime)
-import Storage.Hashed.Hash (Hash)

 -- | Patch metadata in raw form. This is intended for accurate hashing of the
 -- patch info.
 data PatchInfoRaw = PatchInfoRaw
    { pirAuthor      :: ByteString
-    , pirHash        :: ByteString
+    , pirHash        :: (Int, ByteString)
    , pirTitle       :: ByteString
    , pirDescription :: [ByteString]
    , pirJunkPrefix  :: ByteString
@ -40,14 +71,36 @@ data PatchInfoRaw = PatchInfoRaw
    , pirInverted    :: Bool
    }

+-- | A SHA1 hash of the patch info (author, title, description including junk,
+-- timestamp). The hash is in binary form, not hex, i.e. its size is always 20
+-- bytes.
+newtype PatchHash = PatchHash { unPatchHash :: ByteString }
+
+-- | Content size and SHA256 hash of a patch's info and content. The hash is in
+-- binary form, not hex, i.e. its size is always 32 bytes.
+data ContentHash = ContentHash
+    { chSize :: Int
+    , chHash :: ByteString
+    }
+
+-- | Content size and SHA256 hash of an inventory (a patch set in a single
+-- invetory file). The hash is in binary form, not hex, i.e. its size is always
+-- 32 bytes.
+data InventoryHash = InventoryHash
+    { ihSize :: Int
+    , ihHash :: ByteString
+    }
+
+-- | A SHA256 hash of the entire recorded state of the repo. The hash is in
+-- binary form, not hex, i.e. its size is always 32 bytes.
+newtype PristineHash = PristineHash { unPristineHash :: ByteString }
+
 -- | Patch metadata read from the inventory file.
 data PatchInfo = PatchInfo
    { -- | Author name and email
      piAuthor      :: Text
-      -- | Currently this is the patch hash in textual form. Possibly I'll
-      -- change to binary form when I find out the encoding scheme of the hash
-      -- string.
-    , piHash        :: ByteString
+      -- | Patch content hash
+    , piHash        :: ContentHash
      -- | Single message line
    , piTitle       :: Text
      -- | Optional description, may contain several lines
@ -60,7 +113,7 @@ data PatchInfo = PatchInfo

 -- | The information from the hashed inventory file.
 data PatchSeq = PatchSeq
-    { psPristineHash :: Hash
-    , psPristineSize :: Maybe Int
-    , psPatches      :: [(PatchInfo, ByteString)]
+    { psPristineHash :: PristineHash
+    , psPrevious     :: Maybe InventoryHash
+    , psPatches      :: [(PatchInfo, PatchHash)]
    }
--- a/src/Vervis/Darcs.hs
+++ b/src/Vervis/Darcs.hs
@ -132,7 +132,7 @@ readChangesView path off lim = fmap maybeRight $ runExceptT $ do
    let toLE pi h = LogEntry
            { leAuthor  =
                T.stripEnd $ T.takeWhile (/= '<') $ piAuthor pi
-            , leHash    = decodeStrict $ B16.encode h
+            , leHash    = decodeStrict $ B16.encode $ unPatchHash h
            , leMessage = piTitle pi
            , leTime    =
                intervalToEventTime $
--- a/vervis.cabal
+++ b/vervis.cabal
@ -134,6 +134,8 @@ library
                     , blaze-markup
                     , byteable
                     , bytestring
+                       -- for Darcs.Local.PatchInfo.Parser
+                     , bytestring-lexing
                     , case-insensitive
                     , classy-prelude
                     , classy-prelude-conduit