From 5d0114d0e979d69932f58b631650676727fcdd5d Mon Sep 17 00:00:00 2001 From: deadc0de6 Date: Fri, 27 Mar 2020 10:05:26 +0100 Subject: [PATCH] hash diff for #10 --- README.md | 5 +++-- catcli/noder.py | 31 +++++++++++++++++++++------- catcli/utils.py | 2 +- catcli/walker.py | 21 +++++++++---------- tests/helpers.py | 20 ++++++++++++++++++ tests/test_update.py | 48 ++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 104 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index ea6c0a4..7e4e452 100644 --- a/README.md +++ b/README.md @@ -200,8 +200,9 @@ Storage entry can be edited with following catcli commands: ## Update catalog The catalog can be updated with the `update` command. -Updates are based on the access time of each of the files. If using -`-c --hash`, only new files are re-hashed. +Updates are based on the modification time of each file and on its +hash checksum if present (the catalog was indexed with `-c --hash` and +`update` is called with the `-c --hash` switch).
# Examples diff --git a/catcli/noder.py b/catcli/noder.py index ba92479..bd99449 100644 --- a/catcli/noder.py +++ b/catcli/noder.py @@ -67,20 +67,29 @@ class Noder: Logger.err('No node at path \"{}\"'.format(path)) return None - def get_node_if_newer(self, top, path, maccess): - '''return the node (if any) and if path is newer''' + def get_node_if_changed(self, top, path): + '''return the node (if any) and if it has changed''' treepath = path.lstrip(os.sep) node = self.get_node(top, treepath, quiet=True) + # node does not exist if not node: - # node does not exist return None, True + # force re-indexing if no maccess + maccess = os.path.getmtime(path) if not self._has_attr(node, 'maccess') or \ not node.maccess: - # force re-indexing if no maccess return node, True + # maccess changed old_maccess = node.maccess if float(maccess) > float(old_maccess): + self._debug('maccess changed for \"{}\"'.format(path)) return node, True + # test hash + if self.hash and node.md5: + md5 = self._get_hash(path) + if md5 != node.md5: + self._debug('checksum changed for \"{}\"'.format(path)) + return node, True return node, False def get_meta_node(self, top): @@ -96,8 +105,7 @@ class Noder: recursively traverse tree and return size @store: store the size in the node ''' - if self.verbose: - Logger.info('getting node size recursively') + self._debug('getting node size recursively') if node.type == self.TYPE_FILE: return node.size size = 0 @@ -168,7 +176,7 @@ class Noder: return None md5 = None if self.hash: - md5 = utils.md5sum(path) + md5 = self._get_hash(path) relpath = os.sep.join([storagepath, name]) maccess = os.path.getmtime(path) @@ -461,3 +469,12 @@ class Noder: if parent: return os.sep.join([parent, node.name]) return node.name + + def _get_hash(self, path): + """return md5 hash of node""" + return utils.md5sum(path) + + def _debug(self, string): + if not self.verbose: + return + Logger.info(string) diff --git a/catcli/utils.py b/catcli/utils.py index
5267737..3fe7c0d 100644 --- a/catcli/utils.py +++ b/catcli/utils.py @@ -19,7 +19,7 @@ def md5sum(path): '''calculate md5 sum of a file''' p = os.path.realpath(path) if not os.path.exists(p): - Logger.err('\nunable to get md5sum on {}'.format(path)) + Logger.err('\nmd5sum - file does not exist: {}'.format(p)) return None try: with open(p, mode='rb') as f: diff --git a/catcli/walker.py b/catcli/walker.py index 80e99b3..f1e8678 100644 --- a/catcli/walker.py +++ b/catcli/walker.py @@ -17,7 +17,8 @@ class Walker: def __init__(self, noder, nohash=False, debug=False): self.noder = noder - self.noder.set_hashing(not nohash) + self.nohash = nohash + self.noder.set_hashing(not self.nohash) self.debug = debug def index(self, path, parent, name, storagepath=''): @@ -66,8 +67,7 @@ class Walker: for f in files: self._debug('found file {} under {}'.format(f, path)) sub = os.path.join(root, f) - maccess = os.path.getmtime(sub) - reindex, n = self._need_reindex(parent, f, maccess) + reindex, n = self._need_reindex(parent, sub) if not reindex: self._debug('\tignore file {}'.format(sub)) self.noder.flag(n) @@ -82,8 +82,7 @@ class Walker: self._debug('found dir {} under {}'.format(d, path)) base = os.path.basename(d) sub = os.path.join(root, d) - maccess = os.path.getmtime(sub) - reindex, dummy = self._need_reindex(parent, base, maccess) + reindex, dummy = self._need_reindex(parent, sub) if reindex: self._debug('\tre-index directory {}'.format(sub)) dummy = self.noder.dir_node(base, sub, parent, storagepath) @@ -99,19 +98,19 @@ class Walker: self._log(None) return cnt - def _need_reindex(self, top, path, maccess): + def _need_reindex(self, top, path): '''test if node needs re-indexing''' - cnode, newer = self.noder.get_node_if_newer(top, path, maccess) + cnode, changed = self.noder.get_node_if_changed(top, path) if not cnode: self._debug('\tdoes not exist') return True, cnode - if cnode and not newer: + if cnode and not changed: # ignore this node - self._debug('\tis not newer') + 
self._debug('\thas not changed') return False, cnode - if cnode and newer: + if cnode and changed: # remove this node and re-add - self._debug('\tis newer') + self._debug('\thas changed') self._debug('\tremoving node {}'.format(cnode)) cnode.parent = None self._debug('\tis to be re-indexed') diff --git a/tests/helpers.py b/tests/helpers.py index ae5624b..7c7a0c7 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -11,6 +11,7 @@ import random import tempfile import shutil import subprocess +import hashlib TMPSUFFIX = '.catcli' @@ -25,6 +26,25 @@ def get_rnd_string(length): return ''.join(random.choice(alpha) for _ in range(length)) +def md5sum(path): + '''calculate md5 sum of a file''' + p = os.path.realpath(path) + if not os.path.exists(p): + return None + try: + with open(p, mode='rb') as f: + d = hashlib.md5() + while True: + buf = f.read(4096) + if not buf: + break + d.update(buf) + return d.hexdigest() + except PermissionError: + pass + return None + + def clean(path): '''Delete file or folder.''' if not os.path.exists(path): diff --git a/tests/test_update.py b/tests/test_update.py index 39eb02c..2f54b27 100644 --- a/tests/test_update.py +++ b/tests/test_update.py @@ -12,7 +12,7 @@ from catcli.catcli import cmd_index, cmd_update from catcli.noder import Noder from catcli.catalog import Catalog from tests.helpers import create_dir, create_rnd_file, get_tempdir, \ - clean, unix_tree, edit_file, read_from_file + clean, unix_tree, edit_file, read_from_file, md5sum import anytree @@ -31,6 +31,7 @@ class TestIndexing(unittest.TestCase): f1 = create_rnd_file(dirpath, 'file1') f2 = create_rnd_file(dirpath, 'file2') f3 = create_rnd_file(dirpath, 'file3') + f4 = create_rnd_file(dirpath, 'file4') # create 2 directories d1 = create_dir(dirpath, 'dir1') @@ -45,6 +46,12 @@ class TestIndexing(unittest.TestCase): top = noder.new_top_node() catalog = Catalog(catalogpath, force=True, verbose=False) + # get checksums + f4_md5 = md5sum(f4) + self.assertTrue(f4_md5) + d1f1_md5 
= md5sum(d1f1) + self.assertTrue(d1f1_md5) + # create fake args tmpdirname = 'tmpdir' args = {'': dirpath, '': tmpdirname, @@ -56,6 +63,13 @@ class TestIndexing(unittest.TestCase): cmd_index(args, noder, catalog, top, debug=True) self.assertTrue(os.stat(catalogpath).st_size != 0) + # ensure md5 sum are in + nods = noder.find_name(top, os.path.basename(f4)) + self.assertTrue(len(nods) == 1) + nod = nods[0] + self.assertTrue(nod) + self.assertTrue(nod.md5 == f4_md5) + # print catalog noder.print_tree(top) @@ -70,6 +84,19 @@ class TestIndexing(unittest.TestCase): # modify files EDIT = 'edited' edit_file(d1f1, EDIT) + d1f1_md5_new = md5sum(d1f1) + self.assertTrue(d1f1_md5_new) + self.assertTrue(d1f1_md5_new != d1f1_md5) + + # change file without mtime + maccess = os.path.getmtime(f4) + EDIT = 'edited' + edit_file(f4, EDIT) + # reset edit time + os.utime(f4, (maccess, maccess)) + f4_md5_new = md5sum(f4) + self.assertTrue(f4_md5_new) + self.assertTrue(f4_md5_new != f4_md5) # update storage cmd_update(args, noder, catalog, top, debug=True) @@ -81,7 +108,23 @@ class TestIndexing(unittest.TestCase): # explore the top node to find all nodes self.assertTrue(len(top.children) == 1) storage = top.children[0] - self.assertTrue(len(storage.children) == 7) + self.assertTrue(len(storage.children) == 8) + + # ensure d1f1 md5 sum has changed in catalog + nods = noder.find_name(top, os.path.basename(d1f1)) + self.assertTrue(len(nods) == 1) + nod = nods[0] + self.assertTrue(nod) + self.assertTrue(nod.md5 != d1f1_md5) + self.assertTrue(nod.md5 == d1f1_md5_new) + + # ensure f4 md5 sum has changed in catalog + nods = noder.find_name(top, os.path.basename(f4)) + self.assertTrue(len(nods) == 1) + nod = nods[0] + self.assertTrue(nod) + self.assertTrue(nod.md5 != f4_md5) + self.assertTrue(nod.md5 == f4_md5_new) # ensures files and directories are in names = [node.name for node in anytree.PreOrderIter(storage)] @@ -89,6 +132,7 @@ class TestIndexing(unittest.TestCase):
self.assertTrue(os.path.basename(f1) in names) self.assertTrue(os.path.basename(f2) in names) self.assertTrue(os.path.basename(f3) in names) + self.assertTrue(os.path.basename(f4) in names) self.assertTrue(os.path.basename(d1) in names) self.assertTrue(os.path.basename(d1f1) in names) self.assertTrue(os.path.basename(d1f2) in names)