hash diff for #10

pull/19/head
deadc0de6 4 years ago
parent a519d0a36c
commit 5d0114d0e9

@ -200,8 +200,9 @@ Storage entry can be edited with following catcli commands:
## Update catalog
The catalog can be updated with the `update` command.
Updates are based on the access time of each of the files. If using
`-c --hash`, only new files are re-hashed.
Updates are based on the modification time of each file and, if present,
on its hash checksum (this requires that the catalog was indexed with
`-c --hash` and that `update` is also called with `-c --hash`).
# Examples

@ -67,20 +67,29 @@ class Noder:
Logger.err('No node at path \"{}\"'.format(path))
return None
def get_node_if_newer(self, top, path, maccess):
'''return the node (if any) and if path is newer'''
def get_node_if_changed(self, top, path):
    '''
    return the node (if any) and if it has changed
    @top: node handed to get_node() as the root of the lookup
    @path: path of the entry on the filesystem
    returns a (node, changed) tuple; node is None (and changed
    is True) when nothing is found for path
    '''
    # the lookup is done on the path without its leading separator(s)
    treepath = path.lstrip(os.sep)
    node = self.get_node(top, treepath, quiet=True)
    if not node:
        # node does not exist
        return None, True
    # current modification time on the filesystem
    maccess = os.path.getmtime(path)
    if not self._has_attr(node, 'maccess') or not node.maccess:
        # force re-indexing if no maccess
        return node, True
    # maccess changed
    old_maccess = node.maccess
    if float(maccess) > float(old_maccess):
        self._debug('maccess changed for "{}"'.format(path))
        return node, True
    # test hash: only when hashing is enabled and the node carries a
    # checksum; guard the attribute like maccess above since a node
    # may presumably have been stored without one
    if self.hash and self._has_attr(node, 'md5') and node.md5:
        md5 = self._get_hash(path)
        if md5 != node.md5:
            self._debug('checksum changed for "{}"'.format(path))
            return node, True
    return node, False
def get_meta_node(self, top):
@ -96,8 +105,7 @@ class Noder:
recursively traverse tree and return size
@store: store the size in the node
'''
if self.verbose:
Logger.info('getting node size recursively')
self._debug('getting node size recursively')
if node.type == self.TYPE_FILE:
return node.size
size = 0
@ -168,7 +176,7 @@ class Noder:
return None
md5 = None
if self.hash:
md5 = utils.md5sum(path)
md5 = self._get_hash(path)
relpath = os.sep.join([storagepath, name])
maccess = os.path.getmtime(path)
@ -461,3 +469,12 @@ class Noder:
if parent:
return os.sep.join([parent, node.name])
return node.name
def _get_hash(self, path):
    """return the md5 checksum computed by utils.md5sum for path"""
    checksum = utils.md5sum(path)
    return checksum
def _debug(self, string):
    '''
    log a debug message, only when verbose is set
    @string: the message to log
    '''
    if not self.verbose:
        return
    # log the message that was passed in, not a fixed text
    Logger.info(string)

@ -19,7 +19,7 @@ def md5sum(path):
'''calculate md5 sum of a file'''
p = os.path.realpath(path)
if not os.path.exists(p):
Logger.err('\nunable to get md5sum on {}'.format(path))
Logger.err('\nmd5sum - file does not exist: {}'.format(p))
return None
try:
with open(p, mode='rb') as f:

@ -17,7 +17,8 @@ class Walker:
def __init__(self, noder, nohash=False, debug=False):
    '''
    @noder: object used to handle nodes; its set_hashing() is
            called here with the negation of nohash
    @nohash: when True, hashing is switched off on the noder
    @debug: stored as self.debug
    '''
    self.noder = noder
    self.nohash = nohash
    # configure hashing on the noder once, from the stored flag
    self.noder.set_hashing(not self.nohash)
    self.debug = debug
def index(self, path, parent, name, storagepath=''):
@ -66,8 +67,7 @@ class Walker:
for f in files:
self._debug('found file {} under {}'.format(f, path))
sub = os.path.join(root, f)
maccess = os.path.getmtime(sub)
reindex, n = self._need_reindex(parent, f, maccess)
reindex, n = self._need_reindex(parent, sub)
if not reindex:
self._debug('\tignore file {}'.format(sub))
self.noder.flag(n)
@ -82,8 +82,7 @@ class Walker:
self._debug('found dir {} under {}'.format(d, path))
base = os.path.basename(d)
sub = os.path.join(root, d)
maccess = os.path.getmtime(sub)
reindex, dummy = self._need_reindex(parent, base, maccess)
reindex, dummy = self._need_reindex(parent, sub)
if reindex:
self._debug('\tre-index directory {}'.format(sub))
dummy = self.noder.dir_node(base, sub, parent, storagepath)
@ -99,19 +98,19 @@ class Walker:
self._log(None)
return cnt
def _need_reindex(self, top, path, maccess):
def _need_reindex(self, top, path):
'''test if node needs re-indexing'''
cnode, newer = self.noder.get_node_if_newer(top, path, maccess)
cnode, changed = self.noder.get_node_if_changed(top, path)
if not cnode:
self._debug('\tdoes not exist')
return True, cnode
if cnode and not newer:
if cnode and not changed:
# ignore this node
self._debug('\tis not newer')
self._debug('\thas not changed')
return False, cnode
if cnode and newer:
if cnode and changed:
# remove this node and re-add
self._debug('\tis newer')
self._debug('\thas changed')
self._debug('\tremoving node {}'.format(cnode))
cnode.parent = None
self._debug('\tis to be re-indexed')

@ -11,6 +11,7 @@ import random
import tempfile
import shutil
import subprocess
import hashlib
TMPSUFFIX = '.catcli'
@ -25,6 +26,25 @@ def get_rnd_string(length):
return ''.join(random.choice(alpha) for _ in range(length))
def md5sum(path):
    '''
    calculate md5 sum of a file
    @path: path to the file (resolved with os.path.realpath)
    returns the hex digest, or None when the file cannot be read
    (missing, a directory, permission denied, ...)
    '''
    p = os.path.realpath(path)
    try:
        with open(p, mode='rb') as f:
            d = hashlib.md5()
            # read in fixed-size chunks until read() returns b''
            for buf in iter(lambda: f.read(4096), b''):
                d.update(buf)
            return d.hexdigest()
    except OSError:
        # best effort: any OS-level failure means "no checksum"
        return None
def clean(path):
'''Delete file or folder.'''
if not os.path.exists(path):

@ -12,7 +12,7 @@ from catcli.catcli import cmd_index, cmd_update
from catcli.noder import Noder
from catcli.catalog import Catalog
from tests.helpers import create_dir, create_rnd_file, get_tempdir, \
clean, unix_tree, edit_file, read_from_file
clean, unix_tree, edit_file, read_from_file, md5sum
import anytree
@ -31,6 +31,7 @@ class TestIndexing(unittest.TestCase):
f1 = create_rnd_file(dirpath, 'file1')
f2 = create_rnd_file(dirpath, 'file2')
f3 = create_rnd_file(dirpath, 'file3')
f4 = create_rnd_file(dirpath, 'file4')
# create 2 directories
d1 = create_dir(dirpath, 'dir1')
@ -45,6 +46,12 @@ class TestIndexing(unittest.TestCase):
top = noder.new_top_node()
catalog = Catalog(catalogpath, force=True, verbose=False)
# get checksums
f4_md5 = md5sum(f4)
self.assertTrue(f4_md5)
d1f1_md5 = md5sum(d1f1)
self.assertTrue(d1f1_md5)
# create fake args
tmpdirname = 'tmpdir'
args = {'<path>': dirpath, '<name>': tmpdirname,
@ -56,6 +63,13 @@ class TestIndexing(unittest.TestCase):
cmd_index(args, noder, catalog, top, debug=True)
self.assertTrue(os.stat(catalogpath).st_size != 0)
# ensure md5 sum are in
nods = noder.find_name(top, os.path.basename(f4))
self.assertTrue(len(nods) == 1)
nod = nods[0]
self.assertTrue(nod)
self.assertTrue(nod.md5 == f4_md5)
# print catalog
noder.print_tree(top)
@ -70,6 +84,19 @@ class TestIndexing(unittest.TestCase):
# modify files
EDIT = 'edited'
edit_file(d1f1, EDIT)
d1f1_md5_new = md5sum(d1f1)
self.assertTrue(d1f1_md5_new)
self.assertTrue(d1f1_md5_new != d1f1_md5)
# change file without mtime
maccess = os.path.getmtime(f4)
EDIT = 'edited'
edit_file(f4, EDIT)
# reset edit time
os.utime(f4, (maccess, maccess))
# checksum of the file that was just edited (f4), not d1f1
f4_md5_new = md5sum(f4)
self.assertTrue(f4_md5_new)
self.assertTrue(f4_md5_new != f4_md5)
# update storage
cmd_update(args, noder, catalog, top, debug=True)
@ -81,7 +108,23 @@ class TestIndexing(unittest.TestCase):
# explore the top node to find all nodes
self.assertTrue(len(top.children) == 1)
storage = top.children[0]
self.assertTrue(len(storage.children) == 7)
self.assertTrue(len(storage.children) == 8)
# ensure d1f1 md5 sum has changed in catalog
nods = noder.find_name(top, os.path.basename(d1f1))
self.assertTrue(len(nods) == 1)
nod = nods[0]
self.assertTrue(nod)
self.assertTrue(nod.md5 != d1f1_md5)
self.assertTrue(nod.md5 == d1f1_md5_new)
# ensure f4 md5 sum has changed in catalog
nods = noder.find_name(top, os.path.basename(f4))
self.assertTrue(len(nods) == 1)
nod = nods[0]
self.assertTrue(nod)
self.assertTrue(nod.md5 != f4_md5)
self.assertTrue(nod.md5 == f4_md5_new)
# ensures files and directories are in
names = [node.name for node in anytree.PreOrderIter(storage)]
@ -89,6 +132,7 @@ class TestIndexing(unittest.TestCase):
self.assertTrue(os.path.basename(f1) in names)
self.assertTrue(os.path.basename(f2) in names)
self.assertTrue(os.path.basename(f3) in names)
self.assertTrue(os.path.basename(f4) in names)
self.assertTrue(os.path.basename(d1) in names)
self.assertTrue(os.path.basename(d1f1) in names)
self.assertTrue(os.path.basename(d1f2) in names)

Loading…
Cancel
Save