Change storage format of saved analysis.

This commit is contained in:
Jonathan Bernard 2024-02-14 12:18:40 -06:00
parent de9ff2b54a
commit ce821d8f53
4 changed files with 62 additions and 40 deletions

@ -1,4 +1,4 @@
const VERSION* = "1.4.6" const VERSION* = "2.0.0"
const USAGE* = """ const USAGE* = """
Usage: Usage:

@ -1,5 +1,4 @@
import md5, streams import md5, streams
import os
proc fileToMD5*(filename: string) : string = proc fileToMD5*(filename: string) : string =

@ -3,7 +3,8 @@
## ##
## Utility to compare the file contents of two directory trees. ## Utility to compare the file contents of two directory trees.
import os, tables, streams, sequtils, strutils, docopt, marshal import std/[json, jsonutils, os, tables, sequtils, strutils]
import docopt
import incremental_md5, console_progress import incremental_md5, console_progress
import ./cliconstants import ./cliconstants
@ -69,94 +70,116 @@ proc getRelPath(ancestor, child: string): string =
type type
FileEntry* = tuple[relPath: string, checksum: string] FileEntry* = ref tuple[relPath: string, checksum: string]
## Data about one file that has been analyzed ## Data about one file that has been analyzed
DirAnalysis* = ## Analysis data about one directory tree. DirAnalysis* = ## Analysis data about one directory tree.
tuple[allEntries: seq[ref FileEntry], tuple[allEntries: seq[FileEntry],
byRelPath: ref Table[string, ref FileEntry], byRelPath: TableRef[string, FileEntry],
byChecksum: ref Table[string, seq[ref FileEntry]]] byChecksum: TableRef[string, seq[FileEntry]]]
DisplayOptions = tuple[left, right, same, content, path: bool] DisplayOptions = tuple[left, right, same, content, path: bool]
## Consolidated description of which types of results to display. ## Consolidated description of which types of results to display.
func `$`(f: FileEntry): string =
  ## Render a file entry as its checksum, a colon and a space, then its
  ## relative path.
  result = f.checksum
  result.add(": ")
  result.add(f.relPath)
proc getOrFail(n: JsonNode, key: string, objName: string = ""): JsonNode =
  ## Convenience accessor: return the value stored under `key` in the JSON
  ## object `n`, or raise when it cannot be found.
  ##
  ## `objName` is an optional label for the object being read; it is only
  ## used as the prefix of the error message.
  ##
  ## Raises `KeyError` when `n` is nil, is not a JSON object, or has no
  ## field named `key`.
  # `hasKey` is only defined for JObject nodes, so rule out nil and
  # non-object nodes first; a malformed input then surfaces as a catchable
  # error instead of a defect.
  if n.isNil or n.kind != JObject or not n.hasKey(key):
    raise newException(KeyError, objName & " missing key '" & key & "'")
  n[key]
proc getIfExists(n: JsonNode, key: string): JsonNode =
  ## Convenience accessor: return the value stored under `key` in the JSON
  ## object `n`, or a fresh JSON null node when `n` has no such key.
  if n.hasKey(key):
    return n[key]
  return newJNull()
func parseFileEntry(n: JsonNode): FileEntry =
  ## Build a `FileEntry` from a JSON object carrying `relPath` and
  ## `checksum` fields.
  ##
  ## Whatever `getOrFail` raises for a missing field propagates to the
  ## caller.
  # Label the object so a failure reads "file entry missing key '...'"
  # rather than starting with a stray space and no context.
  const objName = "file entry"
  result = new(FileEntry)
  result.relPath = n.getOrFail("relPath", objName).getStr
  result.checksum = n.getOrFail("checksum", objName).getStr
func initDirAnalysis(): DirAnalysis =
  ## Create an empty analysis: no entries, and freshly allocated, empty
  ## lookup tables keyed by relative path and by checksum.
  result.allEntries = @[]
  result.byRelPath = newTable[string, FileEntry]()
  result.byChecksum = newTable[string, seq[FileEntry]]()
func indexEntries(da: var DirAnalysis) =
  ## Populate the `byRelPath` and `byChecksum` lookup tables from the
  ## entries currently held in `da.allEntries`.
  for entry in da.allEntries:
    da.byRelPath[entry.relPath] = entry
    # Start an empty bucket the first time a checksum is seen, then append.
    da.byChecksum.mgetOrPut(entry.checksum, newSeq[FileEntry]()).add(entry)
proc analyzeDir*(root: string, progress: ProgressWrapper): DirAnalysis = proc analyzeDir*(root: string, progress: ProgressWrapper): DirAnalysis =
## Inspect a directory and analyze all files, noting their relative paths and ## Inspect a directory and analyze all files, noting their relative paths and
## checksum of their contents. ## checksum of their contents.
let fileCount = countFiles(root) let fileCount = countFiles(root)
progress.init(root, fileCount) progress.init(root, fileCount + 10)
result = (allEntries: @[], result = initDirAnalysis()
byRelPath: newTable[string, ref FileEntry](),
byChecksum: newTable[string, seq[ref FileEntry]]())
var count = 0 var count = 0
for file in walkDirRec(root): for file in walkDirRec(root):
# Compute checksum
let md5sum = fileToMd5(file) let md5sum = fileToMd5(file)
var fileEntry: ref FileEntry = new(ref FileEntry)
fileEntry[] = (relPath: getRelPath(root, file), checksum: md5sum )
# Add to allEntries list, byRelPath table, and byChecksum table var fileEntry: FileEntry = new(FileEntry)
fileEntry[] = (relPath: getRelPath(root, file), checksum: md5sum)
result.allEntries.add(fileEntry) result.allEntries.add(fileEntry)
result.byRelPath[fileEntry.relPath] = fileEntry
if not result.byChecksum.hasKey(fileEntry.relPath):
result.byChecksum[fileEntry.checksum] = newSeq[ref FileEntry]()
result.byChecksum[fileEntry.checksum].add(fileEntry)
progress.update(count, file) progress.update(count, file)
count += 1 count += 1
result.indexEntries
count += 10
progress.finish() progress.finish()
proc loadAnalysis*(path: string, analysis: var DirAnalysis) = proc loadAnalysis*(path: string): DirAnalysis =
## Load a previously performed directory analysis. ## Load a previously performed directory analysis.
let inStream: Stream = newFileStream(path, fmRead) let allEntriesJson = parseJson(readFile(path))
load(inStream, analysis) result = initDirAnalysis()
result.allEntries = toSeq(items(allEntriesJson)).map(parseFileEntry)
result.indexEntries
proc saveAnalysis*(path: string, analysis: DirAnalysis): void = proc saveAnalysis*(path: string, analysis: DirAnalysis): void =
## Save a completed analysis. ## Save a completed analysis.
let outStream = newFileStream(path, fmWrite) writeFile(path, $(analysis.allEntries.toJson))
store(outStream, analysis)
proc intersection*(left, right: DirAnalysis): seq[ref FileEntry] = proc intersection*(left, right: DirAnalysis): seq[FileEntry] =
## Find all ``FileEntry`` that are the same on both sides: matching contents ## Find all ``FileEntry`` that are the same on both sides: matching contents
## and paths. ## and paths.
return left.allEntries.filter do (item: ref FileEntry) -> bool: return left.allEntries.filter do (item: FileEntry) -> bool:
if not right.byRelPath.hasKey(item.relPath): return false if not right.byRelPath.hasKey(item.relPath): return false
let match = right.byRelPath[item.relPath] let match = right.byRelPath[item.relPath]
if match == nil: return false
return item.checksum == match.checksum return item.checksum == match.checksum
proc difference*(left, right: DirAnalysis): seq[ref FileEntry] = proc difference*(left, right: DirAnalysis): seq[FileEntry] =
## Find all ``FileEntry`` that are present in the left but not present in ## Find all ``FileEntry`` that are present in the left but not present in
## the right. ## the right.
return left.allEntries.filter do (item: ref FileEntry) -> bool: return left.allEntries.filter do (item: FileEntry) -> bool:
return not right.byRelPath.hasKey(item.relPath) and return not right.byRelPath.hasKey(item.relPath) and
not right.byChecksum.hasKey(item.checksum) not right.byChecksum.hasKey(item.checksum)
proc `*`*(left, right: DirAnalysis): seq[ref FileEntry] {.inline.} = proc `*`*(left, right: DirAnalysis): seq[FileEntry] {.inline.} =
## Alias for `intersection(left, right) <#intersection>`_ ## Alias for `intersection(left, right) <#intersection>`_
return intersection(left, right) return intersection(left, right)
proc `-`*(left, right: DirAnalysis): seq[ref FileEntry] {.inline.} = proc `-`*(left, right: DirAnalysis): seq[FileEntry] {.inline.} =
## Alias for `difference(left, right) <#difference>`_ ## Alias for `difference(left, right) <#difference>`_
return difference(left, right) return difference(left, right)
proc samePathDifferentContents*(left, right: DirAnalysis): seq[string] = proc samePathDifferentContents*(left, right: DirAnalysis): seq[string] =
## Find all ``FileEntry`` that have the same paths in both trees but whose ## Find all ``FileEntry`` that have the same paths in both trees but whose
## contents differ. ## contents differ.
let matchingEntries = left.allEntries.filter do (item: ref FileEntry) -> bool: let matchingEntries = left.allEntries.filter do (item: FileEntry) -> bool:
if not right.byRelPath.hasKey(item.relPath): return false if not right.byRelPath.hasKey(item.relPath): return false
let match = right.byRelPath[item.relPath] let match = right.byRelPath[item.relPath]
return item.checksum != match.checksum return item.checksum != match.checksum
return matchingEntries.map(proc(item: ref FileEntry): string = return item.relPath) return matchingEntries.map(proc(item: FileEntry): string = return item.relPath)
proc sameContentsDifferentPaths*(left, right: DirAnalysis): seq[tuple[left, right: ref FileEntry]] = proc sameContentsDifferentPaths*(left, right: DirAnalysis): seq[tuple[left, right: FileEntry]] =
## Find all ``FileEntry`` whose contents are the same in both trees but ## Find all ``FileEntry`` whose contents are the same in both trees but
## which are located at different paths. ## which are located at different paths.
result = @[] result = @[]
@ -192,7 +215,7 @@ when isMainModule:
if fileInfo.kind == pcDir: if fileInfo.kind == pcDir:
return analyzeDir(path, progressWrapper) return analyzeDir(path, progressWrapper)
elif fileInfo.kind == pcFile: elif fileInfo.kind == pcFile:
loadAnalysis(path, result) result = loadAnalysis(path)
else: else:
quitWithError($path & ": is not a file or directory") quitWithError($path & ": is not a file or directory")
@ -202,8 +225,8 @@ when isMainModule:
if not args["<right>"]: if not args["<right>"]:
rightAnalysis = (allEntries: @[], rightAnalysis = (allEntries: @[],
byRelPath: newTable[string, ref FileEntry](), byRelPath: newTable[string, FileEntry](),
byChecksum: newTable[string, seq[ref FileEntry]]()) byChecksum: newTable[string, seq[FileEntry]]())
else: else:
var rightPath: string = $args["<right>"] var rightPath: string = $args["<right>"]
rightAnalysis = loadPath(rightPath) rightAnalysis = loadPath(rightPath)

@ -1,5 +1,5 @@
# Package # Package
version = "1.4.6" version = "2.0.0"
author = "Jonathan Bernard (jdb@jdb-labs.com)" author = "Jonathan Bernard (jdb@jdb-labs.com)"
description = "Utility to generate diffs of full directory trees." description = "Utility to generate diffs of full directory trees."
license = "BSD" license = "BSD"