[PATCH 3 of 3 file] make hash algorithm configurable

Denis Laxalde denis.laxalde at logilab.fr
Tue Jun 12 17:06:12 CEST 2018


David Douard a écrit :
> # HG changeset patch
> # User David Douard <david.douard at logilab.fr>
> # Date 1528728464 -7200
> #      Mon Jun 11 16:47:44 2018 +0200
> # Node ID 065c601bbf25128a279f66c4713251835a5dac9d
> # Parent  1ceaa1c57915d20320f75de75bc051d961b3270a
> make hash algorithm configurable
> 
> - replace File.data_sha1hex by File.data_hash
> - update hooks accordingly
> - store the hash with a "header" ("{SHA1}", "{MD5}" etc.)
> - deprecate 'File.compute_sha1hex()' method
> - add a 'File.compute_hash()' method
> - add an helper 'File.check_hash()' method
> - rename 'compute-sha1hex' config option as 'compute-hash'
> - add a 'hash-algorithm' config option
> - add a migration script

There are also a bunch of "style" changes. It'd help review if you could
split these from functional changes. (I stopped reading at some point.)

> 
> diff --git a/cubicweb_file/entities.py b/cubicweb_file/entities.py
> --- a/cubicweb_file/entities.py
> +++ b/cubicweb_file/entities.py
> @@ -1,13 +1,13 @@
>  """entity classes for File entities
>  
>  :organization: Logilab
> -:copyright: 2003-2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
> +:copyright: 2003-2018 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
>  :contact: http://www.logilab.fr/ -- mailto:contact at logilab.fr
>  """
> -__docformat__ = "restructuredtext en"
> +
>  from warnings import warn
>  import tempfile
> -
> +import re
>  import os
>  from os.path import join, splitext, exists, isfile, isabs
>  import hashlib
> @@ -15,7 +15,6 @@
>  from six import text_type
>  
>  from logilab.mtconverter import guess_mimetype_and_encoding
> -from logilab.common.decorators import cachedproperty
>  from logilab.common.deprecation import deprecated
>  
>  from cubicweb import Binary
> @@ -94,11 +93,31 @@
>              if rpath is not None:
>                  return self._cw.data_url(iconfile)
>  
> -    def compute_sha1hex(self, value=None):
> +    def check_hash(self, value=None):

"value" is not used in the method.

> +        if self.data_hash:
> +            m = re.match('{(?P<alg>[A-Za-z0-9]+)}(?P<hash>.+)', self.data_hash)
> +            if m:
> +                alg = m.group('alg')
> +                hashhex = m.group('hash')
> +            else:  # bw compat, used to be sha1
> +                alg = 'sha1'
> +                hashhex = self.data_hash
> +            comphash = self.compute_hash(alg=alg)
> +            return comphash == hashhex
> +        return False  # ?or True?

Or raise!

> +
> +    def compute_hash(self, value=None, alg=None):
>          if value is None and self.data is not None:
>              value = self.data.getvalue()
>          if value is not None:
> -            return text_type(hashlib.sha1(value).hexdigest())
> +            if alg is None:
> +                alg = self._cw.vreg.config['hash-algorithm']
> +            hasher = hashlib.new(alg, value)
> +            return text_type(hasher.hexdigest())
> +
> +    @deprecated('[file 2.1] use entity.compute_hash() instead')
> +    def compute_sha1hex(self, value=None):
> +        return self.compute_hash(value, alg='sha1')
>  
>      @deprecated('[file 1.9] use entity.cw_adapt_to("IDownloadable").download_url() instead')
>      def download_url(self):
> @@ -182,6 +201,7 @@
>              return
>      return directory
>  
> +
>  # This adapter should not be considered as an official API to manage
>  # thumbnails for any kind of document/content. A proper and generic
>  # API might be proposed in the future
> @@ -192,12 +212,9 @@
>      def _thumbnail_path(self):
>          cachedir = thumb_cache_dir(self._cw.vreg.config)
>          if cachedir:
> -            if self.entity.data_sha1hex:
> -                sha1 = self.entity.data_sha1hex
> -            else:
> -                sha1 = self.entity.compute_sha1hex()
> +            hashhex = self.entity.compute_hash()

We don't use the "data_hash" anymore here and always recompute the hash.
Is this intended? If so, please explain why in the commit message.

>              size = self._cw.vreg.config['image-thumb-size']
> -            return join(cachedir, '%s_%s.png' % (sha1, size))
> +            return join(cachedir, '%s_%s.png' % (hashhex, size))
>          return None
>  
>      def thumbnail_file_name(self):
> @@ -247,7 +264,8 @@
>          return None
>  
>  
> -class UnResizeable(Exception): pass
> +class UnResizeable(Exception):
> +    pass
>  
>  
>  class IImageAdapter(EntityAdapter):
> @@ -300,4 +318,3 @@
>          ithumbnail = self.entity.cw_adapt_to('IThumbnail')
>          stream.filename = ithumbnail.thumbnail_file_name()
>          return stream
> -

> diff --git a/cubicweb_file/migration/2.1.0_Any.py b/cubicweb_file/migration/2.1.0_Any.py
> new file mode 100644
> --- /dev/null
> +++ b/cubicweb_file/migration/2.1.0_Any.py
> @@ -0,0 +1,21 @@
> +
> +subclasses = False
> +is_rel = 'is'
> +
> +if confirm('Also handle entity types that inherit from File?'):
> +    subclasses = True
> +    is_rel = 'is_instance_of'
> +
> +add_attribute('File', 'data_hash')
> +if subclasses:
> +    for etype in schema['File'].specialized_by(recursive=True):
> +        add_attribute(etype.type, 'data_hash')
> +
> +rql('SET X data_hash "{SHA1}"+H WHERE X %(rel)s File, '
> +    'X data_sha1hex H, '
> +    'NOT X data_sha1hex NULL' % is_rel)

Should be "% {'rel': is_rel}" or drop (rel).

Also does "{SHA1}"+H actually work? (I don't know RQL enough to be sure.)

Finally, a commit() wouldn't hurt here.

> +
> +drop_attribute('File', 'data_sha1hex')
> +if subclasses:
> +    for etype in schema['File'].specialized_by(recursive=True):
> +        drop_attribute(etype.type, 'data_sha1hex')


More information about the cubicweb-devel mailing list