[PATCH file v2] make hash algorithm configurable

David Douard david.douard at logilab.fr
Tue Jun 12 17:54:13 CEST 2018


# HG changeset patch
# User David Douard <david.douard at logilab.fr>
# Date 1528728464 -7200
#      Mon Jun 11 16:47:44 2018 +0200
# Node ID 262eee49761ef29b34e6d35b4b0da3a305a0c15e
# Parent  1ceaa1c57915d20320f75de75bc051d961b3270a
make hash algorithm configurable

- replace File.data_sha1hex by File.data_hash
- update hooks accordingly
- store the hash with a "header" ("{SHA1}", "{MD5}" etc.)
- deprecate 'File.compute_sha1hex()' method
- add a 'File.compute_hash()' method
- add an helper 'File.check_hash()' method
- rename 'compute-sha1hex' config option as 'compute-hash'
- add a 'hash-algorithm' config option
- add a migration script

diff --git a/cubicweb_file/entities.py b/cubicweb_file/entities.py
--- a/cubicweb_file/entities.py
+++ b/cubicweb_file/entities.py
@@ -7,7 +7,7 @@
 __docformat__ = "restructuredtext en"
 from warnings import warn
 import tempfile
-
+import re
 import os
 from os.path import join, splitext, exists, isfile, isabs
 import hashlib
@@ -94,11 +94,31 @@
             if rpath is not None:
                 return self._cw.data_url(iconfile)
 
-    def compute_sha1hex(self, value=None):
+    def check_hash(self, value=None):
+        if self.data_hash:
+            m = re.match('{(?P<alg>[A-Za-z0-9]+)}(?P<hash>.+)', self.data_hash)
+            if m:
+                alg = m.group('alg')
+                hashhex = m.group('hash')
+            else:  # bw compat, used to be sha1
+                alg = 'sha1'
+                hashhex = self.data_hash
+            comphash = self.compute_hash(alg=alg)
+            return comphash == hashhex
+        return False  # ?or True?
+
+    def compute_hash(self, value=None, alg=None):
         if value is None and self.data is not None:
             value = self.data.getvalue()
         if value is not None:
-            return text_type(hashlib.sha1(value).hexdigest())
+            if alg is None:
+                alg = self._cw.vreg.config['hash-algorithm']
+            hasher = hashlib.new(alg, value)
+            return text_type(hasher.hexdigest())
+
+    @deprecated('[file 2.1] use entity.compute_hash() instead')
+    def compute_sha1hex(self, value=None):
+        return self.compute_hash(value, alg='sha1')
 
     @deprecated('[file 1.9] use entity.cw_adapt_to("IDownloadable").download_url() instead')
     def download_url(self):
@@ -192,12 +212,9 @@
     def _thumbnail_path(self):
         cachedir = thumb_cache_dir(self._cw.vreg.config)
         if cachedir:
-            if self.entity.data_sha1hex:
-                sha1 = self.entity.data_sha1hex
-            else:
-                sha1 = self.entity.compute_sha1hex()
+            hashhex = self.entity.compute_hash()
             size = self._cw.vreg.config['image-thumb-size']
-            return join(cachedir, '%s_%s.png' % (sha1, size))
+            return join(cachedir, '%s_%s.png' % (hashhex, size))
         return None
 
     def thumbnail_file_name(self):
diff --git a/cubicweb_file/hooks.py b/cubicweb_file/hooks.py
--- a/cubicweb_file/hooks.py
+++ b/cubicweb_file/hooks.py
@@ -22,7 +22,8 @@
     __regid__ = 'updatefilehook'
     __select__ = hook.Hook.__select__ & is_instance('File')
     events = ('before_add_entity', 'before_update_entity',)
-    order = -1 # should be run before other hooks
+    order = -1  # should be run before other hooks
+    category = 'hash'
 
     def __call__(self):
         edited = self.entity.cw_edited
@@ -38,11 +39,13 @@
                     # (original image will be stored)
                     pass
 
-            if self._cw.vreg.config['compute-sha1hex']:
+            if self._cw.vreg.config['compute-hash']:
                 data = edited['data']
                 if data is not None:
-                    data = self.entity.compute_sha1hex(data.getvalue())
-                edited['data_sha1hex'] = data
+                    hashdata = self.entity.compute_hash(data.getvalue())
+                    edited['data_hash'] = '{%s}%s' % (
+                        self._cw.vreg.config['hash-algorithm'],
+                        hashdata)
 
             # thumbnail cache invalidation
             if 'update' in self.event and 'data' in edited:
diff --git a/cubicweb_file/migration/2.1.0_Any.py b/cubicweb_file/migration/2.1.0_Any.py
new file mode 100644
--- /dev/null
+++ b/cubicweb_file/migration/2.1.0_Any.py
@@ -0,0 +1,21 @@
+
+subclasses = False
+is_rel = 'is'
+
+if confirm('Also handle entity types that inherit from File?'):
+    subclasses = True
+    is_rel = 'is_instance_of'
+
+add_attribute('File', 'data_hash')
+if subclasses:
+    for etype in schema['File'].specialized_by(recursive=True):
+        add_attribute(etype.type, 'data_hash')
+
+rql('SET X data_hash "{SHA1}"+H WHERE X %(rel)s File, '
+    'X data_sha1hex H, '
+    'NOT X data_sha1hex NULL' % is_rel)
+
+drop_attribute('File', 'data_sha1hex')
+if subclasses:
+    for etype in schema['File'].specialized_by(recursive=True):
+        drop_attribute(etype.type, 'data_sha1hex')
diff --git a/cubicweb_file/schema.py b/cubicweb_file/schema.py
--- a/cubicweb_file/schema.py
+++ b/cubicweb_file/schema.py
@@ -19,12 +19,12 @@
                                          'Should be dynamically set at upload time.'))
     data_name = String(required=True, fulltextindexed=True,
                        description=_('name of the file. Should be dynamically set at upload time.'))
-    data_sha1hex = String(maxsize=40,
-                          description=_('SHA1 sum of the file. May be set at upload time.'),
-                          __permissions__={'read': ('managers', 'users', 'guests'),
-                                           'add': (),
-                                           'update': (),
-                                           })
+    data_hash = String(
+        maxsize=256,  # max len of currently available hash alg + prefix is 140
+        description=_('hash of the file. May be set at upload time.'),
+        __permissions__={'read': ('managers', 'users', 'guests'),
+                         'add': (),
+                         'update': ()})
     description = RichString(fulltextindexed=True, internationalizable=True,
                              default_format='text/rest')
 
diff --git a/cubicweb_file/site_cubicweb.py b/cubicweb_file/site_cubicweb.py
--- a/cubicweb_file/site_cubicweb.py
+++ b/cubicweb_file/site_cubicweb.py
@@ -1,3 +1,5 @@
+import hashlib
+
 options = (
     ('image-max-size',
      {'type': 'string',
@@ -22,10 +24,19 @@
       'level': 2,
       'group': 'file'
       }),
-    ('compute-sha1hex',
+    ('compute-hash',
      {'type': 'yn',
       'default': False,
-      'help': 'compute sha1 hex digest of your files',
+      'help': 'compute hash digest of your files',
+      'group': 'file',
+      'level': 2,
+      }),
+    ('hash-algorithm',
+     {'type': 'choice',
+      'default': 'sha256',
+      'choices': list(hashlib.algorithms_available),
+      'help': ('hash algorithm used to compute hash of your files (%s)' %
+               ', '.join(sorted(hashlib.algorithms_available))),
       'group': 'file',
       'level': 2,
       }),
diff --git a/test/unittest_file.py b/test/unittest_file.py
--- a/test/unittest_file.py
+++ b/test/unittest_file.py
@@ -67,47 +67,104 @@
             self.assertRaises(NoSelectableObject, fobj.view, 'image')
             self.assertRaises(NoSelectableObject, fobj.view, 'album')
 
-    def test_sha1hex(self):
+    def test_hash(self):
         with self.admin_access.client_cnx() as cnx:
             self.create_user(cnx, login=u'simpleuser')
             cnx.commit()
         with self.new_access(u'simpleuser').client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             obj = cnx.create_entity('File', data_name=u"myfile.pdf",
                                     data=Binary(b"xxx"),
                                     data_format=self.mime_type)
             cnx.commit()
             self.assertEqual(
-                'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                obj.compute_sha1hex())
+                'cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                obj.compute_hash())
             self.assertEqual(
-                'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                obj.data_sha1hex)  # can read
+                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                obj.data_hash)  # can read
             with self.assertRaises(Unauthorized):
                 # write is forbiden
-                obj.cw_set(data_sha1hex=u'1234')
+                obj.cw_set(data_hash=u'1234')
                 cnx.commit()
             obj.cw_set(data=Binary(b'zzz'))
             obj.cw_clear_all_caches()
             self.assertEqual(
-                '40fa37ec00c761c7dbb6ebdee6d4a260b922f5f4',
-                obj.data_sha1hex)
+                '{sha256}17f165d5a5ba695f27c023a83aa2b3463e23810e360b7517127e90161eebabda',  # noqa
+                obj.data_hash)
             with self.assertRaises(Unauthorized):
                 cnx.create_entity('File', data_name=u'anotherfile.pdf',
                                   data=Binary(b'yyy'),
                                   data_format=self.mime_type,
-                                  data_sha1hex=u'deadbeef')
+                                  data_hash=u'deadbeef')
                 cnx.commit()
 
-    def test_sha1hex_nodata(self):
+    def test_hash_nodata(self):
         with self.admin_access.client_cnx() as cnx:
             with cnx.deny_all_hooks_but('metadata'):
-                cnx.vreg.config['compute-sha1hex'] = 1
+                cnx.vreg.config['compute-hash'] = 1
                 obj = cnx.create_entity('File')
                 cnx.commit()
             self.assertEqual(None, obj.data)
-            self.assertEqual(None, obj.data_sha1hex)
-            self.assertEqual(None, obj.compute_sha1hex())
+            self.assertEqual(None, obj.data_hash)
+            self.assertEqual(None, obj.compute_hash())
+
+    def test_checkhash(self):
+        with self.admin_access.client_cnx() as cnx:
+            self.create_user(cnx, login=u'simpleuser')
+            cnx.commit()
+        with self.new_access(u'simpleuser').client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'md5'
+            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
+                                    data=Binary(b"xxx"),
+                                    data_format=self.mime_type)
+            cnx.commit()
+            self.assertEqual('f561aaf6ef0bf14d4208bb46a4ccb3ad',
+                             obj.compute_hash())
+            self.assertTrue(obj.check_hash())
+
+            obj.cw_clear_all_caches()
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            self.assertTrue(obj.check_hash())
+
+            with cnx.allow_all_hooks_but('hash'):
+                obj.cw_set(data=Binary(b'toto'))
+                cnx.commit()
+
+            # stored hash should still be the old one ('xxx' using md5)
+            self.assertEqual('{md5}f561aaf6ef0bf14d4208bb46a4ccb3ad',
+                             obj.data_hash)
+            self.assertEqual('0b9c2625dc21ef05f6ad4ddf47c5f203837aa32c',
+                             obj.compute_hash())
+            self.assertFalse(obj.check_hash())
+
+    def test_checkhash_bwcompat(self):
+        with self.admin_access.client_cnx() as cnx:
+            self.create_user(cnx, login=u'simpleuser')
+            cnx.commit()
+        with self.new_access(u'simpleuser').client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
+                                    data=Binary(b"xxx"),
+                                    data_format=self.mime_type)
+            cnx.commit()
+            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
+                             obj.compute_hash())
+            self.assertTrue(obj.check_hash())
+
+            cnx.system_sql(
+                'UPDATE cw_file SET cw_data_hash=%(hash)s '
+                'WHERE cw_data_name=%(name)s',
+                {'hash': 'b60d121b438a380c343d5ec3c2037564b82ffef3',
+                 'name': 'myfile.pdf'})
+            obj.cw_clear_all_caches()
+            # note the absence of the hash algo "header" here:
+            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
+                             obj.data_hash)
+            self.assertTrue(obj.check_hash())
 
 
 class ImageTC(CubicWebTC):
diff --git a/test/unittest_hooks.py b/test/unittest_hooks.py
--- a/test/unittest_hooks.py
+++ b/test/unittest_hooks.py
@@ -95,41 +95,74 @@
             self.assertEqual(pilimg.size, (20, 20))
 
 
-class Sha1TC(CubicWebTC):
+class HashTC(CubicWebTC):
 
-    def test_init_sha1(self):
+    def test_init_hash(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 0
+            cnx.vreg.config['compute-hash'] = 0
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
-            self.assertEqual(None, fobj.data_sha1hex)
+            self.assertEqual(None, fobj.data_hash)
 
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
             self.assertEqual(
-                u'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                fobj.data_sha1hex)
+                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                fobj.data_hash)
 
     def test_modify_data(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
             fobj.cw_set(data=Binary(b'yyy'))
             self.assertEqual(
-                u'186154712b2d5f6791d85b9a0987b98fa231779c',
-                fobj.data_sha1hex)
+                '{sha256}f2afd1cacb5441a5e65a7a460a5f9898b7b98b08aa6323a2e53c8b9a9686cd86',  # noqa
+                fobj.data_hash)
 
-    def test_manual_set_sha1_forbidden(self):
+    def test_manual_set_hash_forbidden(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 0
+            cnx.vreg.config['compute-hash'] = 0
             with self.assertRaises(Unauthorized):
                 cnx.create_entity(
                     'File', data_name=u"foo.txt", data=Binary(b"xxx"),
-                    data_sha1hex=u'0'*40)
+                    data_hash=u'0'*40)
                 cnx.commit()
 
+    def test_hash_algos(self):
+        with self.admin_access.client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
+            fobj = cnx.create_entity(
+                'File', data_name=u"foo.txt", data=Binary(b"xxx")).eid
+            cnx.commit()
+        with self.admin_access.client_cnx() as cnx:
+            fobj = cnx.find('File', eid=fobj).one()
+
+            cnx.vreg.config['hash-algorithm'] = 'md5'
+            fobj.cw_set(data=Binary(b'md5'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{md5}1bc29b36f623ba82aaf6724fd3b16718',
+                fobj.data_hash)
+
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            fobj.cw_set(data=Binary(b'sha1'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{sha1}415ab40ae9b7cc4e66d6769cb2c08106e8293b48',
+                fobj.data_hash)
+
+            cnx.vreg.config['hash-algorithm'] = 'sha512'
+            fobj.cw_set(data=Binary(b'sha512'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{sha512}1f9720f871674c18e5fecff61d92c1355cd4bfac25699fb7ddfe7717c9669b4d085193982402156122dfaa706885fd64741704649795c65b2a5bdec40347e28a',  # noqa
+                fobj.data_hash)
+
 
 if __name__ == '__main__':
     from unittest import main


More information about the cubicweb-devel mailing list