[PATCH file v2] make hash algorithm configurable

Denis Laxalde denis.laxalde at logilab.fr
Wed Jun 13 09:35:12 CEST 2018


David Douard a écrit :
> # HG changeset patch
> # User David Douard <david.douard at logilab.fr>
> # Date 1528728464 -7200
> #      Mon Jun 11 16:47:44 2018 +0200
> # Node ID 262eee49761ef29b34e6d35b4b0da3a305a0c15e
> # Parent  1ceaa1c57915d20320f75de75bc051d961b3270a
> make hash algorithm configurable

I quickly went through this patch and it seems that a couple of
issues/questions I raised while reviewing the first version have not
been addressed. Can you double-check and send another version? Thanks :)

> - replace File.data_sha1hex by File.data_hash
> - update hooks accordingly
> - store the hash with a "header" ("{SHA1}", "{MD5}" etc.)
> - deprecate 'File.compute_sha1hex()' method
> - add a 'File.compute_hash()' method
> - add an helper 'File.check_hash()' method

Considering how these "compute_hash" and "check_hash" methods access
attributes of the entity (e.g. the "if self.data_hash" short-circuit or
the "value" keyword), I wonder if these wouldn't better defined as plain
functions. For instance, "compute_hash" (resp. "check_hash") would be
given the unhashed (resp. hashed) value. In particular, it'd make it
clearer that no-db access is actually required for these computations.

> - rename 'compute-sha1hex' config option as 'compute-hash'
> - add a 'hash-algorithm' config option
> - add a migration script
> 
> diff --git a/cubicweb_file/entities.py b/cubicweb_file/entities.py
> --- a/cubicweb_file/entities.py
> +++ b/cubicweb_file/entities.py
> @@ -7,7 +7,7 @@
>  __docformat__ = "restructuredtext en"
>  from warnings import warn
>  import tempfile
> -
> +import re
>  import os
>  from os.path import join, splitext, exists, isfile, isabs
>  import hashlib
> @@ -94,11 +94,31 @@
>              if rpath is not None:
>                  return self._cw.data_url(iconfile)
>  
> -    def compute_sha1hex(self, value=None):
> +    def check_hash(self, value=None):
> +        if self.data_hash:
> +            m = re.match('{(?P<alg>[A-Za-z0-9]+)}(?P<hash>.+)', self.data_hash)
> +            if m:
> +                alg = m.group('alg')
> +                hashhex = m.group('hash')
> +            else:  # bw compat, used to be sha1
> +                alg = 'sha1'
> +                hashhex = self.data_hash
> +            comphash = self.compute_hash(alg=alg)
> +            return comphash == hashhex
> +        return False  # ?or True?
> +
> +    def compute_hash(self, value=None, alg=None):
>          if value is None and self.data is not None:
>              value = self.data.getvalue()
>          if value is not None:
> -            return text_type(hashlib.sha1(value).hexdigest())
> +            if alg is None:
> +                alg = self._cw.vreg.config['hash-algorithm']
> +            hasher = hashlib.new(alg, value)
> +            return text_type(hasher.hexdigest())
> +
> +    @deprecated('[file 2.1] use entity.compute_hash() instead')
> +    def compute_sha1hex(self, value=None):
> +        return self.compute_hash(value, alg='sha1')
>  
>      @deprecated('[file 1.9] use entity.cw_adapt_to("IDownloadable").download_url() instead')
>      def download_url(self):
> @@ -192,12 +212,9 @@
>      def _thumbnail_path(self):
>          cachedir = thumb_cache_dir(self._cw.vreg.config)
>          if cachedir:
> -            if self.entity.data_sha1hex:
> -                sha1 = self.entity.data_sha1hex
> -            else:
> -                sha1 = self.entity.compute_sha1hex()
> +            hashhex = self.entity.compute_hash()
>              size = self._cw.vreg.config['image-thumb-size']
> -            return join(cachedir, '%s_%s.png' % (sha1, size))
> +            return join(cachedir, '%s_%s.png' % (hashhex, size))
>          return None
>  
>      def thumbnail_file_name(self):
> diff --git a/cubicweb_file/hooks.py b/cubicweb_file/hooks.py
> --- a/cubicweb_file/hooks.py
> +++ b/cubicweb_file/hooks.py
> @@ -22,7 +22,8 @@
>      __regid__ = 'updatefilehook'
>      __select__ = hook.Hook.__select__ & is_instance('File')
>      events = ('before_add_entity', 'before_update_entity',)
> -    order = -1 # should be run before other hooks
> +    order = -1  # should be run before other hooks
> +    category = 'hash'
>  
>      def __call__(self):
>          edited = self.entity.cw_edited
> @@ -38,11 +39,13 @@
>                      # (original image will be stored)
>                      pass
>  
> -            if self._cw.vreg.config['compute-sha1hex']:
> +            if self._cw.vreg.config['compute-hash']:
>                  data = edited['data']
>                  if data is not None:
> -                    data = self.entity.compute_sha1hex(data.getvalue())
> -                edited['data_sha1hex'] = data
> +                    hashdata = self.entity.compute_hash(data.getvalue())
> +                    edited['data_hash'] = '{%s}%s' % (
> +                        self._cw.vreg.config['hash-algorithm'],
> +                        hashdata)
>  
>              # thumbnail cache invalidation
>              if 'update' in self.event and 'data' in edited:
> diff --git a/cubicweb_file/migration/2.1.0_Any.py b/cubicweb_file/migration/2.1.0_Any.py
> new file mode 100644
> --- /dev/null
> +++ b/cubicweb_file/migration/2.1.0_Any.py
> @@ -0,0 +1,21 @@
> +
> +subclasses = False
> +is_rel = 'is'
> +
> +if confirm('Also handle entity types that inherit from File?'):
> +    subclasses = True
> +    is_rel = 'is_instance_of'
> +
> +add_attribute('File', 'data_hash')
> +if subclasses:
> +    for etype in schema['File'].specialized_by(recursive=True):
> +        add_attribute(etype.type, 'data_hash')
> +
> +rql('SET X data_hash "{SHA1}"+H WHERE X %(rel)s File, '
> +    'X data_sha1hex H, '
> +    'NOT X data_sha1hex NULL' % is_rel)
> +
> +drop_attribute('File', 'data_sha1hex')
> +if subclasses:
> +    for etype in schema['File'].specialized_by(recursive=True):
> +        drop_attribute(etype.type, 'data_sha1hex')
> diff --git a/cubicweb_file/schema.py b/cubicweb_file/schema.py
> --- a/cubicweb_file/schema.py
> +++ b/cubicweb_file/schema.py
> @@ -19,12 +19,12 @@
>                                           'Should be dynamically set at upload time.'))
>      data_name = String(required=True, fulltextindexed=True,
>                         description=_('name of the file. Should be dynamically set at upload time.'))
> -    data_sha1hex = String(maxsize=40,
> -                          description=_('SHA1 sum of the file. May be set at upload time.'),
> -                          __permissions__={'read': ('managers', 'users', 'guests'),
> -                                           'add': (),
> -                                           'update': (),
> -                                           })
> +    data_hash = String(
> +        maxsize=256,  # max len of currently available hash alg + prefix is 140
> +        description=_('hash of the file. May be set at upload time.'),
> +        __permissions__={'read': ('managers', 'users', 'guests'),
> +                         'add': (),
> +                         'update': ()})
>      description = RichString(fulltextindexed=True, internationalizable=True,
>                               default_format='text/rest')
>  
> diff --git a/cubicweb_file/site_cubicweb.py b/cubicweb_file/site_cubicweb.py
> --- a/cubicweb_file/site_cubicweb.py
> +++ b/cubicweb_file/site_cubicweb.py
> @@ -1,3 +1,5 @@
> +import hashlib
> +
>  options = (
>      ('image-max-size',
>       {'type': 'string',
> @@ -22,10 +24,19 @@
>        'level': 2,
>        'group': 'file'
>        }),
> -    ('compute-sha1hex',
> +    ('compute-hash',
>       {'type': 'yn',
>        'default': False,
> -      'help': 'compute sha1 hex digest of your files',
> +      'help': 'compute hash digest of your files',
> +      'group': 'file',
> +      'level': 2,
> +      }),
> +    ('hash-algorithm',
> +     {'type': 'choice',
> +      'default': 'sha256',
> +      'choices': list(hashlib.algorithms_available),
> +      'help': ('hash algorithm used to compute hash of your files (%s)' %
> +               ', '.join(sorted(hashlib.algorithms_available))),
>        'group': 'file',
>        'level': 2,
>        }),
> diff --git a/test/unittest_file.py b/test/unittest_file.py
> --- a/test/unittest_file.py
> +++ b/test/unittest_file.py
> @@ -67,47 +67,104 @@
>              self.assertRaises(NoSelectableObject, fobj.view, 'image')
>              self.assertRaises(NoSelectableObject, fobj.view, 'album')
>  
> -    def test_sha1hex(self):
> +    def test_hash(self):
>          with self.admin_access.client_cnx() as cnx:
>              self.create_user(cnx, login=u'simpleuser')
>              cnx.commit()
>          with self.new_access(u'simpleuser').client_cnx() as cnx:
> -            cnx.vreg.config['compute-sha1hex'] = 1
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'sha256'
>              obj = cnx.create_entity('File', data_name=u"myfile.pdf",
>                                      data=Binary(b"xxx"),
>                                      data_format=self.mime_type)
>              cnx.commit()
>              self.assertEqual(
> -                'b60d121b438a380c343d5ec3c2037564b82ffef3',
> -                obj.compute_sha1hex())
> +                'cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
> +                obj.compute_hash())
>              self.assertEqual(
> -                'b60d121b438a380c343d5ec3c2037564b82ffef3',
> -                obj.data_sha1hex)  # can read
> +                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
> +                obj.data_hash)  # can read
>              with self.assertRaises(Unauthorized):
>                  # write is forbiden
> -                obj.cw_set(data_sha1hex=u'1234')
> +                obj.cw_set(data_hash=u'1234')
>                  cnx.commit()
>              obj.cw_set(data=Binary(b'zzz'))
>              obj.cw_clear_all_caches()
>              self.assertEqual(
> -                '40fa37ec00c761c7dbb6ebdee6d4a260b922f5f4',
> -                obj.data_sha1hex)
> +                '{sha256}17f165d5a5ba695f27c023a83aa2b3463e23810e360b7517127e90161eebabda',  # noqa
> +                obj.data_hash)
>              with self.assertRaises(Unauthorized):
>                  cnx.create_entity('File', data_name=u'anotherfile.pdf',
>                                    data=Binary(b'yyy'),
>                                    data_format=self.mime_type,
> -                                  data_sha1hex=u'deadbeef')
> +                                  data_hash=u'deadbeef')
>                  cnx.commit()
>  
> -    def test_sha1hex_nodata(self):
> +    def test_hash_nodata(self):
>          with self.admin_access.client_cnx() as cnx:
>              with cnx.deny_all_hooks_but('metadata'):
> -                cnx.vreg.config['compute-sha1hex'] = 1
> +                cnx.vreg.config['compute-hash'] = 1
>                  obj = cnx.create_entity('File')
>                  cnx.commit()
>              self.assertEqual(None, obj.data)
> -            self.assertEqual(None, obj.data_sha1hex)
> -            self.assertEqual(None, obj.compute_sha1hex())
> +            self.assertEqual(None, obj.data_hash)
> +            self.assertEqual(None, obj.compute_hash())
> +
> +    def test_checkhash(self):
> +        with self.admin_access.client_cnx() as cnx:
> +            self.create_user(cnx, login=u'simpleuser')
> +            cnx.commit()
> +        with self.new_access(u'simpleuser').client_cnx() as cnx:
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'md5'
> +            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
> +                                    data=Binary(b"xxx"),
> +                                    data_format=self.mime_type)
> +            cnx.commit()
> +            self.assertEqual('f561aaf6ef0bf14d4208bb46a4ccb3ad',
> +                             obj.compute_hash())
> +            self.assertTrue(obj.check_hash())
> +
> +            obj.cw_clear_all_caches()
> +            cnx.vreg.config['hash-algorithm'] = 'sha1'
> +            self.assertTrue(obj.check_hash())
> +
> +            with cnx.allow_all_hooks_but('hash'):
> +                obj.cw_set(data=Binary(b'toto'))
> +                cnx.commit()
> +
> +            # stored hash should still be the old one ('xxx' using md5)
> +            self.assertEqual('{md5}f561aaf6ef0bf14d4208bb46a4ccb3ad',
> +                             obj.data_hash)
> +            self.assertEqual('0b9c2625dc21ef05f6ad4ddf47c5f203837aa32c',
> +                             obj.compute_hash())
> +            self.assertFalse(obj.check_hash())
> +
> +    def test_checkhash_bwcompat(self):
> +        with self.admin_access.client_cnx() as cnx:
> +            self.create_user(cnx, login=u'simpleuser')
> +            cnx.commit()
> +        with self.new_access(u'simpleuser').client_cnx() as cnx:
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'sha1'
> +            obj = cnx.create_entity('File', data_name=u"myfile.pdf",
> +                                    data=Binary(b"xxx"),
> +                                    data_format=self.mime_type)
> +            cnx.commit()
> +            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
> +                             obj.compute_hash())
> +            self.assertTrue(obj.check_hash())
> +
> +            cnx.system_sql(
> +                'UPDATE cw_file SET cw_data_hash=%(hash)s '
> +                'WHERE cw_data_name=%(name)s',
> +                {'hash': 'b60d121b438a380c343d5ec3c2037564b82ffef3',
> +                 'name': 'myfile.pdf'})
> +            obj.cw_clear_all_caches()
> +            # note the absence of the hash algo "header" here:
> +            self.assertEqual('b60d121b438a380c343d5ec3c2037564b82ffef3',
> +                             obj.data_hash)
> +            self.assertTrue(obj.check_hash())
>  
>  
>  class ImageTC(CubicWebTC):
> diff --git a/test/unittest_hooks.py b/test/unittest_hooks.py
> --- a/test/unittest_hooks.py
> +++ b/test/unittest_hooks.py
> @@ -95,41 +95,74 @@
>              self.assertEqual(pilimg.size, (20, 20))
>  
>  
> -class Sha1TC(CubicWebTC):
> +class HashTC(CubicWebTC):
>  
> -    def test_init_sha1(self):
> +    def test_init_hash(self):
>          with self.admin_access.client_cnx() as cnx:
> -            cnx.vreg.config['compute-sha1hex'] = 0
> +            cnx.vreg.config['compute-hash'] = 0
>              fobj = cnx.create_entity(
>                  'File', data_name=u"foo.txt", data=Binary(b"xxx"))
> -            self.assertEqual(None, fobj.data_sha1hex)
> +            self.assertEqual(None, fobj.data_hash)
>  
> -            cnx.vreg.config['compute-sha1hex'] = 1
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'sha256'
>              fobj = cnx.create_entity(
>                  'File', data_name=u"foo.txt", data=Binary(b"xxx"))
>              self.assertEqual(
> -                u'b60d121b438a380c343d5ec3c2037564b82ffef3',
> -                fobj.data_sha1hex)
> +                '{sha256}cd2eb0837c9b4c962c22d2ff8b5441b7b45805887f051d39bf133b583baf6860',  # noqa
> +                fobj.data_hash)
>  
>      def test_modify_data(self):
>          with self.admin_access.client_cnx() as cnx:
> -            cnx.vreg.config['compute-sha1hex'] = 1
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'sha256'
>              fobj = cnx.create_entity(
>                  'File', data_name=u"foo.txt", data=Binary(b"xxx"))
>              fobj.cw_set(data=Binary(b'yyy'))
>              self.assertEqual(
> -                u'186154712b2d5f6791d85b9a0987b98fa231779c',
> -                fobj.data_sha1hex)
> +                '{sha256}f2afd1cacb5441a5e65a7a460a5f9898b7b98b08aa6323a2e53c8b9a9686cd86',  # noqa
> +                fobj.data_hash)
>  
> -    def test_manual_set_sha1_forbidden(self):
> +    def test_manual_set_hash_forbidden(self):
>          with self.admin_access.client_cnx() as cnx:
> -            cnx.vreg.config['compute-sha1hex'] = 0
> +            cnx.vreg.config['compute-hash'] = 0
>              with self.assertRaises(Unauthorized):
>                  cnx.create_entity(
>                      'File', data_name=u"foo.txt", data=Binary(b"xxx"),
> -                    data_sha1hex=u'0'*40)
> +                    data_hash=u'0'*40)
>                  cnx.commit()
>  
> +    def test_hash_algos(self):
> +        with self.admin_access.client_cnx() as cnx:
> +            cnx.vreg.config['compute-hash'] = 1
> +            cnx.vreg.config['hash-algorithm'] = 'sha256'
> +            fobj = cnx.create_entity(
> +                'File', data_name=u"foo.txt", data=Binary(b"xxx")).eid
> +            cnx.commit()
> +        with self.admin_access.client_cnx() as cnx:
> +            fobj = cnx.find('File', eid=fobj).one()
> +
> +            cnx.vreg.config['hash-algorithm'] = 'md5'
> +            fobj.cw_set(data=Binary(b'md5'))
> +            fobj.cw_clear_all_caches()
> +            self.assertEqual(
> +                '{md5}1bc29b36f623ba82aaf6724fd3b16718',
> +                fobj.data_hash)
> +
> +            cnx.vreg.config['hash-algorithm'] = 'sha1'
> +            fobj.cw_set(data=Binary(b'sha1'))
> +            fobj.cw_clear_all_caches()
> +            self.assertEqual(
> +                '{sha1}415ab40ae9b7cc4e66d6769cb2c08106e8293b48',
> +                fobj.data_hash)
> +
> +            cnx.vreg.config['hash-algorithm'] = 'sha512'
> +            fobj.cw_set(data=Binary(b'sha512'))
> +            fobj.cw_clear_all_caches()
> +            self.assertEqual(
> +                '{sha512}1f9720f871674c18e5fecff61d92c1355cd4bfac25699fb7ddfe7717c9669b4d085193982402156122dfaa706885fd64741704649795c65b2a5bdec40347e28a',  # noqa
> +                fobj.data_hash)
> +
>  
>  if __name__ == '__main__':
>      from unittest import main
> 


More information about the cubicweb-devel mailing list