[PATCH 2 of 2 saem_ref v2] [ark] Insert "external" ARK identifiers into ark table

Denis Laxalde denis.laxalde at logilab.fr
Wed Feb 21 17:17:13 CET 2018


# HG changeset patch
# User Denis Laxalde <denis.laxalde at logilab.fr>
# Date 1519227658 -3600
#      Wed Feb 21 16:40:58 2018 +0100
# Node ID 5462e52e5c3e402e3f1f79cf93f80902d0c36230
# Parent  00a7985081f1e0c376e9dd2d7de605ce9e7e3bd7
# Available At http://hg.logilab.org/review/cubes/saem_ref
#              hg pull http://hg.logilab.org/review/cubes/saem_ref -r 5462e52e5c3e
# EXP-Topic ark/unique-overall
[ark] Insert "external" ARK identifiers into ark table

When calling set_ark_and_cwuri() (from a hook upon entity addition or
from a dataimport metadata generator), we now insert a record into "ark"
table when an "ark" value is specified in entity.cw_edited (meaning the
ARK identifier is externally given). Afterwards, the value returned by
insert_ark() is used to update cw_edited dict to ensure consistency with
parsed value (in particular, users may specify the "ark:/" scheme, which
we do not store).

When an external ARK identifier is given, we only check it is correctly
formatted but don't check for our internal constraints (presence of a
prefix and control character, etc.).

By inserting external ARK identifiers in this "ark" table, we can now
ensure uniqueness of values across entity types, thus solving a major
integrity bug.

diff --git a/cubicweb_saem_ref/hooks.py b/cubicweb_saem_ref/hooks.py
--- a/cubicweb_saem_ref/hooks.py
+++ b/cubicweb_saem_ref/hooks.py
@@ -17,6 +17,8 @@
 
 from datetime import datetime
 
+import psycopg2
+
 from logilab.common.registry import objectify_predicate
 
 from yams import ValidationError
@@ -30,6 +32,10 @@ from cubicweb.server import hook
 from cubicweb.hooks import metadata
 
 from cubicweb_skos.hooks import ReplaceExternalUriByEntityHook
+from .ark import (
+    match as ark_match,
+    insert_ark,
+)
 
 
 # skip relations involved in the logging itself and some others that should not
@@ -213,7 +219,21 @@ class AssignARKHook(hook.Hook):
 
 def set_ark_and_cwuri(cw, entity_attrs, naa_what=None):
     ark = entity_attrs.get('ark')
-    if not ark:
+    if ark:
+        m = ark_match(ark, external=True)
+        msg = _('malformatted ARK idenfitier %(ark)s (expecting [ark:/]NAAN/Name[Qualifier])')
+        msgargs = {'ark': ark}
+        if m is None:
+            raise ValidationError(None, {'ark': msg}, msgargs=msgargs)
+        matched = m.groupdict()
+        naan, name = matched.pop('naan'), matched.pop('name')
+        qualifier = matched.get('qualifier')
+        try:
+            ark = insert_ark(cw, naan, name, qualifier)
+        except psycopg2.IntegrityError:
+            raise ValidationError(
+                None, {'ark': _('%(ark)s already exists')}, msgargs={'ark': ark})
+    else:
         cwuri = entity_attrs.get('cwuri')
         ark = None if cwuri is None else extract_ark(cwuri)
         if ark is None:
@@ -225,7 +245,7 @@ def set_ark_and_cwuri(cw, entity_attrs, 
                 'IARKGenerator', cw,
                 naa_what=naa_what, **entity_attrs)
             ark = generator.generate_ark()
-        entity_attrs['ark'] = ark
+    entity_attrs['ark'] = ark
     if 'cwuri' not in entity_attrs:
         # store ark as cwuri, not an URL so it's easier to move the database while still easy to get
         # an URL from there (see the cwuri_url function) XXX (syt) any other reason to do so? there
diff --git a/test/unittest_hooks.py b/test/unittest_hooks.py
--- a/test/unittest_hooks.py
+++ b/test/unittest_hooks.py
@@ -21,6 +21,7 @@ import unittest
 
 import pytz
 
+from cubicweb import ValidationError
 from cubicweb.devtools import PostgresApptestConfiguration
 from cubicweb.devtools.testlib import CubicWebTC
 
@@ -233,6 +234,33 @@ class ARKGenerationHooksTC(CubicWebTC):
         self._check_ark(entity, **kwargs)
         self.assertTrue(testutils.match_ark_uri(entity.cwuri), entity.cwuri)
 
+    def test_external_ark_invalid(self):
+        with self.admin_access.cnx() as cnx:
+            for ark in [
+                u'string/name',  # NAAN is not a number
+                u'123',  # Missing Name
+            ]:
+                with self.subTest(ark=ark):
+                    with self.assertRaises(ValidationError) as cm:
+                        testutils.authority_record(cnx, u'bob', ark=ark)
+                    self.assertIn('malformatted ARK idenfitier', str(cm.exception))
+
+    def test_external_ark_duplicated(self):
+        ark = '1/bob/thefirst'
+        with self.admin_access.cnx() as cnx:
+            cnx.create_entity('Agent', name=u'bob', ark=ark)
+            with self.assertRaises(ValidationError) as cm:
+                cnx.create_entity('ConceptScheme', ark=ark)
+            self.assertIn('already exists', str(cm.exception))
+
+    def test_external_ark_duplicated_generated(self):
+        with self.admin_access.cnx() as cnx:
+            authority = testutils.authority_with_naa(cnx)
+            agent = cnx.create_entity('Agent', name=u'bob', authority=authority)
+            with self.assertRaises(ValidationError) as cm:
+                cnx.create_entity('ConceptScheme', ark=agent.ark)
+            self.assertIn('already exists', str(cm.exception))
+
     def test_ark_generation_authorityrecord(self):
         with self.admin_access.repo_cnx() as cnx:
             agent = testutils.authority_record(cnx, u'bob')



More information about the saem-devel mailing list