[PATCH 3 of 3 file] make hash algorithm configurable

David Douard david.douard at logilab.fr
Tue Jun 12 12:00:51 CEST 2018


# HG changeset patch
# User David Douard <david.douard at logilab.fr>
# Date 1528728464 -7200
#      Mon Jun 11 16:47:44 2018 +0200
# Node ID 065c601bbf25128a279f66c4713251835a5dac9d
# Parent  1ceaa1c57915d20320f75de75bc051d961b3270a
make hash algorithm configurable

- replace File.data_sha1hex by File.data_hash
- update hooks accordingly
- store the hash with a "header" ("{SHA1}", "{MD5}" etc.)
- deprecate 'File.compute_sha1hex()' method
- add a 'File.compute_hash()' method
- add an helper 'File.check_hash()' method
- rename 'compute-sha1hex' config option as 'compute-hash'
- add a 'hash-algorithm' config option
- add a migration script

diff --git a/cubicweb_file/entities.py b/cubicweb_file/entities.py
--- a/cubicweb_file/entities.py
+++ b/cubicweb_file/entities.py
@@ -1,13 +1,13 @@
 """entity classes for File entities
 
 :organization: Logilab
-:copyright: 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+:copyright: 2003-2018 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 :contact: http://www.logilab.fr/ -- mailto:contact at logilab.fr
 """
-__docformat__ = "restructuredtext en"
+
 from warnings import warn
 import tempfile
-
+import re
 import os
 from os.path import join, splitext, exists, isfile, isabs
 import hashlib
@@ -15,7 +15,6 @@
 from six import text_type
 
 from logilab.mtconverter import guess_mimetype_and_encoding
-from logilab.common.decorators import cachedproperty
 from logilab.common.deprecation import deprecated
 
 from cubicweb import Binary
@@ -94,11 +93,31 @@
             if rpath is not None:
                 return self._cw.data_url(iconfile)
 
-    def compute_sha1hex(self, value=None):
+    def check_hash(self, value=None):
+        if self.data_hash:
+            m = re.match('{(?P<alg>[A-Za-z0-9]+)}(?P<hash>.+)', self.data_hash)
+            if m:
+                alg = m.group('alg')
+                hashhex = m.group('hash')
+            else:  # bw compat, used to be sha1
+                alg = 'sha1'
+                hashhex = self.data_hash
+            comphash = self.compute_hash(alg=alg)
+            return comphash == hashhex
+        return False  # ?or True?
+
+    def compute_hash(self, value=None, alg=None):
         if value is None and self.data is not None:
             value = self.data.getvalue()
         if value is not None:
-            return text_type(hashlib.sha1(value).hexdigest())
+            if alg is None:
+                alg = self._cw.vreg.config['hash-algorithm']
+            hasher = hashlib.new(alg, value)
+            return text_type(hasher.hexdigest())
+
+    @deprecated('[file 2.1] use entity.compute_hash() instead')
+    def compute_sha1hex(self, value=None):
+        return self.compute_hash(value, alg='sha1')
 
     @deprecated('[file 1.9] use entity.cw_adapt_to("IDownloadable").download_url() instead')
     def download_url(self):
@@ -182,6 +201,7 @@
             return
     return directory
 
+
 # This adapter should not be considered as an official API to manage
 # thumbnails for any kind of document/content. A proper and generic
 # API might be proposed in the future
@@ -192,12 +212,9 @@
     def _thumbnail_path(self):
         cachedir = thumb_cache_dir(self._cw.vreg.config)
         if cachedir:
-            if self.entity.data_sha1hex:
-                sha1 = self.entity.data_sha1hex
-            else:
-                sha1 = self.entity.compute_sha1hex()
+            hashhex = self.entity.compute_hash()
             size = self._cw.vreg.config['image-thumb-size']
-            return join(cachedir, '%s_%s.png' % (sha1, size))
+            return join(cachedir, '%s_%s.png' % (hashhex, size))
         return None
 
     def thumbnail_file_name(self):
@@ -247,7 +264,8 @@
         return None
 
 
-class UnResizeable(Exception): pass
+class UnResizeable(Exception):
+    pass
 
 
 class IImageAdapter(EntityAdapter):
@@ -300,4 +318,3 @@
         ithumbnail = self.entity.cw_adapt_to('IThumbnail')
         stream.filename = ithumbnail.thumbnail_file_name()
         return stream
-
diff --git a/cubicweb_file/hooks.py b/cubicweb_file/hooks.py
--- a/cubicweb_file/hooks.py
+++ b/cubicweb_file/hooks.py
@@ -1,14 +1,12 @@
 """File related hooks
 
 :organization: Logilab
-:copyright: 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+:copyright: 2003-2018 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
 :contact: http://www.logilab.fr/ -- mailto:contact at logilab.fr
 """
-__docformat__ = "restructuredtext en"
 
 import os
 
-from cubicweb import ValidationError
 from cubicweb.server import hook
 from cubicweb.sobjects.notification import ContentAddedView
 from cubicweb.predicates import is_instance
@@ -22,7 +20,8 @@
     __regid__ = 'updatefilehook'
     __select__ = hook.Hook.__select__ & is_instance('File')
     events = ('before_add_entity', 'before_update_entity',)
-    order = -1 # should be run before other hooks
+    order = -1  # should be run before other hooks
+    category = 'hash'
 
     def __call__(self):
         edited = self.entity.cw_edited
@@ -38,11 +37,13 @@
                     # (original image will be stored)
                     pass
 
-            if self._cw.vreg.config['compute-sha1hex']:
+            if self._cw.vreg.config['compute-hash']:
                 data = edited['data']
                 if data is not None:
-                    data = self.entity.compute_sha1hex(data.getvalue())
-                edited['data_sha1hex'] = data
+                    hashdata = self.entity.compute_hash(data.getvalue())
+                    edited['data_hash'] = '{%s}%s' % (
+                        self._cw.vreg.config['hash-algorithm'],
+                        hashdata)
 
             # thumbnail cache invalidation
             if 'update' in self.event and 'data' in edited:
@@ -54,8 +55,10 @@
                     try:
                         os.unlink(thumbpath)
                     except Exception as exc:
-                        self.warning('could not invalidate thumbnail file `%s` (cause: %s)',
-                                     thumbpath, exc)
+                        self.warning(
+                            'could not invalidate thumbnail file `%s` '
+                            '(cause: %s)',
+                            thumbpath, exc)
 
 
 class FileAddedView(ContentAddedView):
diff --git a/cubicweb_file/migration/2.1.0_Any.py b/cubicweb_file/migration/2.1.0_Any.py
new file mode 100644
--- /dev/null
+++ b/cubicweb_file/migration/2.1.0_Any.py
@@ -0,0 +1,21 @@
+
+subclasses = False
+is_rel = 'is'
+
+if confirm('Also handle entity types that inherit from File?'):
+    subclasses = True
+    is_rel = 'is_instance_of'
+
+add_attribute('File', 'data_hash')
+if subclasses:
+    for etype in schema['File'].specialized_by(recursive=True):
+        add_attribute(etype.type, 'data_hash')
+
+rql('SET X data_hash "{SHA1}"+H WHERE X %(rel)s File, '
+    'X data_sha1hex H, '
+    'NOT X data_sha1hex NULL' % is_rel)
+
+drop_attribute('File', 'data_sha1hex')
+if subclasses:
+    for etype in schema['File'].specialized_by(recursive=True):
+        drop_attribute(etype.type, 'data_sha1hex')
diff --git a/cubicweb_file/schema.py b/cubicweb_file/schema.py
--- a/cubicweb_file/schema.py
+++ b/cubicweb_file/schema.py
@@ -1,9 +1,6 @@
 # pylint: disable-msg=E0611,F0401
 
-try:
-    from cubicweb import _
-except ImportError:
-    _ = unicode
+from cubicweb import _
 
 from yams.buildobjs import EntityType, String, Bytes, RichString
 
@@ -12,19 +9,23 @@
     """a downloadable file which may contains binary data"""
     title = String(fulltextindexed=True, maxsize=256)
     data = Bytes(required=True, description=_('file to upload'))
-    data_format = String(required=True, maxsize=128,
-                         description=_('MIME type of the file. Should be dynamically set at upload time.'))
-    data_encoding = String(maxsize=32,
-                           description=_('encoding of the file when it applies (e.g. text). '
-                                         'Should be dynamically set at upload time.'))
-    data_name = String(required=True, fulltextindexed=True,
-                       description=_('name of the file. Should be dynamically set at upload time.'))
-    data_sha1hex = String(maxsize=40,
-                          description=_('SHA1 sum of the file. May be set at upload time.'),
-                          __permissions__={'read': ('managers', 'users', 'guests'),
-                                           'add': (),
-                                           'update': (),
-                                           })
+    data_format = String(
+        required=True, maxsize=128,
+        description=_('MIME type of the file. Should be dynamically set '
+                      'at upload time.'))
+    data_encoding = String(
+        maxsize=32,
+        description=_('encoding of the file when it applies (e.g. text). '
+                      'Should be dynamically set at upload time.'))
+    data_name = String(
+        required=True, fulltextindexed=True,
+        description=_('name of the file. Should be dynamically set '
+                      'at upload time.'))
+    data_hash = String(
+        maxsize=256,  # max len of currently available hash alg + prefix is 140
+        description=_('hash of the file. May be set at upload time.'),
+        __permissions__={'read': ('managers', 'users', 'guests'),
+                         'add': (),
+                         'update': ()})
     description = RichString(fulltextindexed=True, internationalizable=True,
                              default_format='text/rest')
-
diff --git a/cubicweb_file/site_cubicweb.py b/cubicweb_file/site_cubicweb.py
--- a/cubicweb_file/site_cubicweb.py
+++ b/cubicweb_file/site_cubicweb.py
@@ -1,3 +1,5 @@
+import hashlib
+
 options = (
     ('image-max-size',
      {'type': 'string',
@@ -22,10 +24,19 @@
       'level': 2,
       'group': 'file'
       }),
-    ('compute-sha1hex',
+    ('compute-hash',
      {'type': 'yn',
       'default': False,
-      'help': 'compute sha1 hex digest of your files',
+      'help': 'compute hash digest of your files',
+      'group': 'file',
+      'level': 2,
+      }),
+    ('hash-algorithm',
+     {'type': 'choice',
+      'default': 'sha256',
+      'choices': list(hashlib.algorithms_available),
+      'help': ('hash algorithm used to compute hash of your files (%s)' %
+               ', '.join(sorted(hashlib.algorithms_available))),
       'group': 'file',
       'level': 2,
       }),
diff --git a/test/unittest_file.py b/test/unittest_file.py
--- a/test/unittest_file.py
+++ b/test/unittest_file.py
@@ -67,47 +67,104 @@
             self.assertRaises(NoSelectableObject, fobj.view, 'image')
             self.assertRaises(NoSelectableObject, fobj.view, 'album')
 
-    def test_sha1hex(self):
+    def test_hash(self):
         with self.admin_access.client_cnx() as cnx:
             self.create_user(cnx, login=u'simpleuser')
             cnx.commit()
         with self.new_access(u'simpleuser').client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             obj = cnx.create_entity('File', data_name=u"myfile.pdf",
                                     data=Binary(b"xxx"),
                                     data_format=self.mime_type)
             cnx.commit()
             self.assertEqual(
-                'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                obj.compute_sha1hex())
+                'cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                obj.compute_hash())
             self.assertEqual(
-                'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                obj.data_sha1hex)  # can read
+                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                obj.data_hash)  # can read
             with self.assertRaises(Unauthorized):
                 # write is forbiden
-                obj.cw_set(data_sha1hex=u'1234')
+                obj.cw_set(data_hash=u'1234')
                 cnx.commit()
             obj.cw_set(data=Binary(b'zzz'))
             obj.cw_clear_all_caches()
             self.assertEqual(
-                '40fa37ec00c761c7dbb6ebdee6d4a260b922f5f4',
-                obj.data_sha1hex)
+                '{sha256}17f165d5a5ba695f27c023a83aa2b3463e23810e360b7517127e90161eebabda',  # noqa
+                obj.data_hash)
             with self.assertRaises(Unauthorized):
                 cnx.create_entity('File', data_name=u'anotherfile.pdf',
                                   data=Binary(b'yyy'),
                                   data_format=self.mime_type,
-                                  data_sha1hex=u'deadbeef')
+                                  data_hash=u'deadbeef')
                 cnx.commit()
 
-    def test_sha1hex_nodata(self):
+    def test_hash_nodata(self):
         with self.admin_access.client_cnx() as cnx:
             with cnx.deny_all_hooks_but('metadata'):
-                cnx.vreg.config['compute-sha1hex'] = 1
+                cnx.vreg.config['compute-hash'] = 1
                 obj = cnx.create_entity('File')
                 cnx.commit()
             self.assertEqual(None, obj.data)
-            self.assertEqual(None, obj.data_sha1hex)
-            self.assertEqual(None, obj.compute_sha1hex())
+            self.assertEqual(None, obj.data_hash)
+            self.assertEqual(None, obj.compute_hash())
+
+    def test_checkhash(self):
+        with self.admin_access.client_cnx() as cnx:
+            self.create_user(cnx, login=u'simpleuser')
+            cnx.commit()
+        with self.new_access(u'simpleuser').client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'md5'
+            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
+                                    data=Binary(b"xxx"),
+                                    data_format=self.mime_type)
+            cnx.commit()
+            self.assertEqual('f561aaf6ef0bf14d4208bb46a4ccb3ad',
+                             obj.compute_hash())
+            self.assertTrue(obj.check_hash())
+
+            obj.cw_clear_all_caches()
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            self.assertTrue(obj.check_hash())
+
+            with cnx.allow_all_hooks_but('hash'):
+                obj.cw_set(data=Binary(b'toto'))
+                cnx.commit()
+
+            # stored hash should still be the old one ('xxx' using md5)
+            self.assertEqual('{md5}f561aaf6ef0bf14d4208bb46a4ccb3ad',
+                             obj.data_hash)
+            self.assertEqual('0b9c2625dc21ef05f6ad4ddf47c5f203837aa32c',
+                             obj.compute_hash())
+            self.assertFalse(obj.check_hash())
+
+    def test_checkhash_bwcompat(self):
+        with self.admin_access.client_cnx() as cnx:
+            self.create_user(cnx, login=u'simpleuser')
+            cnx.commit()
+        with self.new_access(u'simpleuser').client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
+                                    data=Binary(b"xxx"),
+                                    data_format=self.mime_type)
+            cnx.commit()
+            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
+                             obj.compute_hash())
+            self.assertTrue(obj.check_hash())
+
+            cnx.system_sql(
+                'UPDATE cw_file SET cw_data_hash=%(hash)s '
+                'WHERE cw_data_name=%(name)s',
+                {'hash': 'b60d121b438a380c343d5ec3c2037564b82ffef3',
+                 'name': 'myfile.pdf'})
+            obj.cw_clear_all_caches()
+            # note the absence of the hash algo "header" here:
+            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
+                             obj.data_hash)
+            self.assertTrue(obj.check_hash())
 
 
 class ImageTC(CubicWebTC):
diff --git a/test/unittest_hooks.py b/test/unittest_hooks.py
--- a/test/unittest_hooks.py
+++ b/test/unittest_hooks.py
@@ -95,41 +95,74 @@
             self.assertEqual(pilimg.size, (20, 20))
 
 
-class Sha1TC(CubicWebTC):
+class HashTC(CubicWebTC):
 
-    def test_init_sha1(self):
+    def test_init_hash(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 0
+            cnx.vreg.config['compute-hash'] = 0
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
-            self.assertEqual(None, fobj.data_sha1hex)
+            self.assertEqual(None, fobj.data_hash)
 
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
             self.assertEqual(
-                u'b60d121b438a380c343d5ec3c2037564b82ffef3',
-                fobj.data_sha1hex)
+                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
+                fobj.data_hash)
 
     def test_modify_data(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 1
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
             fobj = cnx.create_entity(
                 'File', data_name=u"foo.txt", data=Binary(b"xxx"))
             fobj.cw_set(data=Binary(b'yyy'))
             self.assertEqual(
-                u'186154712b2d5f6791d85b9a0987b98fa231779c',
-                fobj.data_sha1hex)
+                '{sha256}f2afd1cacb5441a5e65a7a460a5f9898b7b98b08aa6323a2e53c8b9a9686cd86',  # noqa
+                fobj.data_hash)
 
-    def test_manual_set_sha1_forbidden(self):
+    def test_manual_set_hash_forbidden(self):
         with self.admin_access.client_cnx() as cnx:
-            cnx.vreg.config['compute-sha1hex'] = 0
+            cnx.vreg.config['compute-hash'] = 0
             with self.assertRaises(Unauthorized):
                 cnx.create_entity(
                     'File', data_name=u"foo.txt", data=Binary(b"xxx"),
-                    data_sha1hex=u'0'*40)
+                    data_hash=u'0'*40)
                 cnx.commit()
 
+    def test_hash_algos(self):
+        with self.admin_access.client_cnx() as cnx:
+            cnx.vreg.config['compute-hash'] = 1
+            cnx.vreg.config['hash-algorithm'] = 'sha256'
+            fobj = cnx.create_entity(
+                'File', data_name=u"foo.txt", data=Binary(b"xxx")).eid
+            cnx.commit()
+        with self.admin_access.client_cnx() as cnx:
+            fobj = cnx.find('File', eid=fobj).one()
+
+            cnx.vreg.config['hash-algorithm'] = 'md5'
+            fobj.cw_set(data=Binary(b'md5'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{md5}1bc29b36f623ba82aaf6724fd3b16718',
+                fobj.data_hash)
+
+            cnx.vreg.config['hash-algorithm'] = 'sha1'
+            fobj.cw_set(data=Binary(b'sha1'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{sha1}415ab40ae9b7cc4e66d6769cb2c08106e8293b48',
+                fobj.data_hash)
+
+            cnx.vreg.config['hash-algorithm'] = 'sha512'
+            fobj.cw_set(data=Binary(b'sha512'))
+            fobj.cw_clear_all_caches()
+            self.assertEqual(
+                '{sha512}1f9720f871674c18e5fecff61d92c1355cd4bfac25699fb7ddfe7717c9669b4d085193982402156122dfaa706885fd64741704649795c65b2a5bdec40347e28a',  # noqa
+                fobj.data_hash)
+
 
 if __name__ == '__main__':
     from unittest import main


More information about the cubicweb-devel mailing list