[PATCH 08 of 11 saem_ref] [ark] Introduce data declaration for "ark" table and PL/pgSQL function for identifier generation

Denis Laxalde denis.laxalde at logilab.fr
Mon Feb 12 15:20:44 CET 2018


# HG changeset patch
# User Denis Laxalde <denis.laxalde at logilab.fr>
# Date 1518430204 -3600
#      Mon Feb 12 11:10:04 2018 +0100
# Node ID cdc498fe733dd4496575c874140c10e21e66fd3f
# Parent  43cf7062f7aa13401eb6e7dc839a621d72e8b7c8
# Available At http://hg.logilab.org/review/cubes/saem_ref
#              hg pull http://hg.logilab.org/review/cubes/saem_ref -r cdc498fe733d
# EXP-Topic ark/postgres-generated
[ark] Introduce data declaration for "ark" table and  PL/pgSQL function for identifier generation

We add two SQL scripts in schema/ directory, one to install the
"pgcrypto" extension (needed for random()) and one containing "ark"
table definition and a couple of function to generate unique and random
identifiers:

* `gen_ark_name` cares about the "name" part of ARK identifiers and
  returns a random text value ensuring structural constraints are
  fulfilled (no vowels, no more than 3 consecutive letters, possibly a
  prefix and a trailing control character)
* `random_char` is a simpler helper producing a random string from a set
  of characters
* `gen_ark` is the main function that will be used to generate records
  in the "ark" table; it loops by trying to INSERT a record with
  randomly generated column values (only the "name" part for now) and
  will return once there is no unique constraint violation

Python tests added in test_ark.py. As noted in comments of the test case
class, we use CubicWebTC despite we don't need access to a CubicWeb
database (nor schema): this is only by convenience to get the postgresql
cluster setup (and teardown) at the cost for more expensive test
runtime.

Related to extranet #46881387.

diff --git a/MANIFEST.in b/MANIFEST.in
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,7 @@
 recursive-include cubicweb_saem_ref/data *.gif *.jpg *.png *.ico *.css *.js *.xml
 recursive-include test/data *.xml *.xsd *.rdf *.csv
 include test/*.py test/data/bootstrap_cubes
+include cubicweb_saem_ref/schema/*.sql
 include cubicweb_saem_ref/i18n/*.po
 include cubicweb_saem_ref/i18n/*.pot
 include cubicweb_saem_ref/migration/*.py
diff --git a/cubicweb_saem_ref/schema/_ext.postgres.sql b/cubicweb_saem_ref/schema/_ext.postgres.sql
new file mode 100644
--- /dev/null
+++ b/cubicweb_saem_ref/schema/_ext.postgres.sql
@@ -0,0 +1,1 @@
+CREATE EXTENSION IF NOT EXISTS pgcrypto;
diff --git a/cubicweb_saem_ref/schema/ark.postgres.sql b/cubicweb_saem_ref/schema/ark.postgres.sql
new file mode 100644
--- /dev/null
+++ b/cubicweb_saem_ref/schema/ark.postgres.sql
@@ -0,0 +1,73 @@
+CREATE TABLE ark (
+    name TEXT NOT NULL,
+    qualifier TEXT NOT NULL DEFAULT '',
+    CONSTRAINT qualified_name PRIMARY KEY (name, qualifier)
+);
+
+-- Return a part of an ARK identifier of `len` length with `prefix` prefix and
+-- `control_char` trailing character.
+CREATE OR REPLACE
+FUNCTION gen_ark_name(len INTEGER, prefix TEXT, control_char TEXT)
+RETURNS text AS $$
+DECLARE
+    ark TEXT;
+    letters TEXT;
+    numbers TEXT;
+    characters TEXT;
+    next_char TEXT;
+    tail TEXT;
+BEGIN
+    ASSERT len > length(prefix) + length(control_char);
+    letters := 'bcdfghjkmnpqrstvwxz';
+    numbers := '0123456789';
+    characters := letters || numbers;
+    -- Prefix ARK identifier.
+    ark := prefix;
+    LOOP
+        tail := substring(ark, length(ark) - 2, 3);
+        -- No more than 3 consecutive consonants.
+        IF tail ~ '([a-z]){3}' THEN
+            next_char := random_char(numbers);
+        ELSE
+            next_char := random_char(characters);
+        END IF;
+        ark := ark || next_char;
+        IF length(ark) = len - 1 THEN
+            EXIT;
+        END IF;
+    END LOOP;
+    -- Add control character.
+    RETURN ark || control_char;
+END;
+$$ language 'plpgsql';
+
+-- Return an random character within given string
+CREATE OR REPLACE FUNCTION random_char(chars TEXT) RETURNS text AS $$
+BEGIN
+    RETURN substr(chars, (random() * (length(chars) -1) + 1)::INTEGER, 1);
+END;
+$$ language 'plpgsql';
+
+-- Insert a record in "ark" table ensuring uniqueness constraints are
+-- fulfilled.
+CREATE OR REPLACE
+FUNCTION gen_ark(len INTEGER, prefix TEXT, control_char TEXT)
+RETURNS TEXT AS $$
+DECLARE
+    ark_name TEXT;
+BEGIN
+    LOOP
+        BEGIN
+            INSERT INTO ark
+                VALUES (gen_ark_name(len, prefix, control_char), DEFAULT)
+                RETURNING name INTO ark_name;
+            RETURN ark_name;
+        EXCEPTION
+            WHEN unique_violation THEN
+                -- Continue and try with another "name".
+                RAISE WARNING 'unique_violation';
+                NULL;
+        END;
+    END LOOP;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/dev-requirements.txt b/dev-requirements.txt
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,3 +1,4 @@
 mock
+psycopg2
 pytest
 webtest
diff --git a/doc/ark.rst b/doc/ark.rst
--- a/doc/ark.rst
+++ b/doc/ark.rst
@@ -181,4 +181,15 @@ Une fois le ``NAAN`` désigné, le ``Nam
 * 'oa' pour les agents (`Agent`),
 
 * 'a' pour les identifiants attribués par le web-service d'assignement d'identifiant ARK, en général
-  pour les archives proprement dites.
\ No newline at end of file
+  pour les archives proprement dites.
+
+Notes pour la réimplémentation de la fonctionnalité de génération d'identifiants ARK
+====================================================================================
+
+* http://wiki.postgresql.org/wiki/Pseudo_encrypt
+* https://pypi.python.org/pypi/pynoid
+* https://blog.andyet.com/2016/02/23/generating-shortids-in-postgres
+* https://www.postgresql.org/docs/current/static/pgcrypto.html
+* http://rob.conery.io/2014/05/28/a-better-id-generator-for-postgresql/
+* http://github.com/mjgiarlo/arkpy
+* https://github.com/rjacobson16/ark_service
diff --git a/test/test_ark.py b/test/test_ark.py
new file mode 100644
--- /dev/null
+++ b/test/test_ark.py
@@ -0,0 +1,103 @@
+# copyright 2018 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact at logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as publishged by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+"""cubicweb-saem-ref tests for ARK identifier generation."""
+
+import psycopg2
+
+from cubicweb import devtools
+from cubicweb.devtools import (
+    PostgresApptestConfiguration,
+    testlib,
+)
+
+import testutils
+
+
+def setUpModule():
+    testutils.startpgcluster(__file__)
+
+
+class ArkServiceTC(testlib.CubicWebTC):
+    # This could be a plain unittest.TestCase as we don't use CubicWeb's
+    # database. But on the other hand, CubicWebTC makes it easier to get a
+    # postgres configuration handling setup/teardown.
+
+    configcls = PostgresApptestConfiguration
+
+    @classmethod
+    def connect(cls):
+        db_handler = devtools.get_test_db_handler(cls.config, cls.init_config)
+        optmapping = [
+            ('dbname', 'db-name'),
+            ('user', 'db-user'),
+            ('password', 'db-password'),
+            ('host', 'db-host'),
+            ('port', 'db-port'),
+        ]
+        connect_parameters = {k: db_handler.system_source[v]
+                              for k, v in optmapping}
+        return psycopg2.connect(**connect_parameters)
+
+    def test_unique_generated(self):
+        """We should be able to produce 20 ARK identifiers of length 4 with a
+        2-characters prefix and 1 control character.
+        """
+        n = 20
+        arks = []
+        with self.connect() as conn:
+            with conn.cursor() as cu:
+                for _ in range(n):
+                    cu.execute("SELECT * from gen_ark(%s, %s, %s);",
+                               (4, 'rf', 'x'))
+                    arks.append(cu.fetchone()[0])
+                    conn.commit()
+                cu.execute("SELECT * from ark")
+                r = cu.fetchall()
+        self.assertEqual(len(set(r)), n, len(r))
+        self.assertCountEqual([ark for ark, _ in r], arks)
+
+    def test_qualifier(self):
+        def insert(qualifier=None):
+            with self.connect() as conn:
+                with conn.cursor() as cu:
+                    cu.execute("INSERT INTO ark VALUES (%s, %s);",
+                               ("he", qualifier))
+        insert("ah")
+        with self.assertRaises(psycopg2.IntegrityError) as cm:
+            insert("ah")
+        self.assertIn(
+            'duplicate key value violates unique constraint "qualified_name"',
+            str(cm.exception),
+        )
+        insert("he")
+
+    def test_dup(self):
+        def insert():
+            with self.connect() as conn:
+                with conn.cursor() as cu:
+                    cu.execute("INSERT INTO ark VALUES ('he', 'aha');")
+        insert()
+        with self.assertRaises(psycopg2.IntegrityError) as cm:
+            insert()
+        self.assertIn(
+            'duplicate key value violates unique constraint "qualified_name"',
+            str(cm.exception),
+        )
+
+
+if __name__ == '__main__':
+    import unittest
+    unittest.main()


More information about the saem-devel mailing list