[Cubicweb] work on dataimport stores

aurélien campéas aurelien.campeas at gmail.com
Thu Feb 26 12:38:01 CET 2015


2015-02-26 9:38 GMT+01:00 Sylvain Thénault <sylvain.thenault at logilab.fr>:

> On 26 février 09:10, aurélien campéas wrote:
> > 2015-02-25 23:30 GMT+01:00 Sylvain Thénault <sylvain.thenault at logilab.fr
> >:
> > > This is probably related to the fact that I've updated the overall
> > > benchmark code to use the latest skos cube; there was a severe issue
> > > before that: hooks were actually not activated!
> > >
> >
> > Yes indeed. Wasn't it by design ? That's what I thought anyway.
>
> No, that was because of a misunderstanding. If we want to compare the
> stores' performance vs. the pros and cons of each one, we should imo run
> the test with hooks activated.
>

If we want to compare comparable things, we should have at least one run
with hooks explicitly disabled. That's what I do in the first attached diff.

Note that fastimport may appear faster than it should because it is missing
symmetric relation handling (I haven't quantified its impact, but it
should not be huge).

With hooks activated (followup diff) the run time explodes. It shows
how ruthlessly inefficient the cubicweb hook system currently is... but
that is nothing we didn't know beforehand.

Note that fastimport precisely gives tools to fine-tune the runnable hooks
(decide which to skip or which to defer in later transactions) to work
around
this general issue.


>
> > > > Note that to make the test pass I had to de-inline the pref_label
> > > relation.
> > > >
> > > > I'm seriously considering adding inlined relation support to
> > > > .insert_relations.
> > >
> > > If not, this deserves a note in the results.txt file. Notice that iirc
> > > other stores
> > > rely on inlined relations being set through insert_entity or similar.
> >
> > Maybe but that relation is reported missing in the check after the
> > fastimport test, even though it now uses the standard skos import code.
>
> Though this used to work on my first implementation using the fastimport
> cube.
> It's unclear to me what's changed that may cause that.
>

Fixed now.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.cubicweb.org/pipermail/cubicweb/attachments/20150226/1bfd7c8d/attachment-0050.html>
-------------- next part --------------
# HG changeset patch
# User Aurelien Campeas <aurelien.campeas at logilab.fr>
# Date 1424949123 -3600
#      Thu Feb 26 12:12:03 2015 +0100
# Node ID 2fd4d30a73bd1fddfa1365fa1a6a521dad133df4
# Parent  c4917a2568d4da5d124c3a014c370cc6680efef5
run without hooks for comparability

diff --git a/benchmark.py b/benchmark.py
--- a/benchmark.py
+++ b/benchmark.py
@@ -65,11 +65,11 @@ class PostgresImportTC(CubicWebTC):
             self.assertEqual(label.label, u'communications')
             self.failIf(cnx.execute('Any L WHERE NOT EXISTS(L pref_label_of X) AND NOT EXISTS(L alt_label_of Y) AND NOT EXISTS(L hidden_label_of Z)'))
 
-    #data_file, check_imported = 'eurovoc_skos.rdf', lambda *args: None
-    data_file, check_imported = 'siaf_matieres.xml', check_siaf_shortened
+    data_file, check_imported = 'eurovoc_skos.rdf', lambda *args: None
+    #data_file, check_imported = 'siaf_matieres.xml', check_siaf_shortened
 
     @timed
-    def test_nohook(self):
+    def _test_nohook(self):
         with self.admin_access.repo_cnx() as cnx:
             _skos_import(cnx, self.data_file, 'nohookos')
         self.check_imported()
@@ -161,31 +161,32 @@ def _skos_import(cnx, fpath, impl):
     importer = importercls(cnx, store, import_log, source=cnx.repo.system_source,
                            etypes_order_hint=('ConceptScheme', 'Concept', 'Label'))
     with cnx.ensure_cnx_set:
-        def set_cwuri_if_needed(extentity, extid2eid=importer.extid2eid):
-            """set cwuri to extid when the external entity has to be created"""
-            if extentity.extid not in extid2eid:
-                extentity.values['cwuri'] = set([unicode(extentity.extid)])
-            return extentity
-        stats = importer.import_entities(imap(set_cwuri_if_needed, entities))
-        if impl in 'sqlgenos':
-            store.flush()
-        elif impl in 'fastimport':
-            from cubes.worker.testutils import run_all_tasks
-            errors = []
-            store.fc.run_deferred_hooks(errors)
-            store.flush()
-            assert not errors
+        with cnx.deny_all_hooks_but():
+            def set_cwuri_if_needed(extentity, extid2eid=importer.extid2eid):
+                """set cwuri to extid when the external entity has to be created"""
+                if extentity.extid not in extid2eid:
+                    extentity.values['cwuri'] = set([unicode(extentity.extid)])
+                return extentity
+            stats = importer.import_entities(imap(set_cwuri_if_needed, entities))
+            if impl in 'sqlgenos':
+                store.flush()
+            elif impl in 'fastimport':
+                from cubes.worker.testutils import run_all_tasks
+                errors = []
+                store.fc.run_deferred_hooks(errors)
+                store.flush()
+                assert not errors
+                cnx.commit()
+                run_all_tasks(cnx)
+                cnx.commit()
+            elif impl == 'massive':
+                store.flush_meta_data()
+                for rdef in iter_rdef(cnx.vreg.schema):
+                    store.convert_relations(*rdef)
+                store.commit()
+                store.cleanup()
             cnx.commit()
-            run_all_tasks(cnx)
-            cnx.commit()
-        elif impl == 'massive':
-            store.flush_meta_data()
-            for rdef in iter_rdef(cnx.vreg.schema):
-                store.convert_relations(*rdef)
-            store.commit()
-            store.cleanup()
-        cnx.commit()
-    return stats
+        return stats
 
 
 def iter_rdef(schema):
diff --git a/results.txt b/results.txt
--- a/results.txt
+++ b/results.txt
@@ -32,6 +32,16 @@ sqlgen           2.66      3.50
 nohook           3.97      7.79
 ========== ========== =========
 
+Auc: (all without hooks)
+========== ========== =========
+Store      time.clock time.time
+========== ========== =========
+massive          0.82      1.03
+fastimport       1.74      2.85
+sqlgen           2.33      2.95
+nohook           4.06      6.78
+========== ========== =========
+
 
 eurovoc.xml
 -----------
@@ -48,8 +58,15 @@ sqlgen          197.90    306.66
 nohook          369.23   4947.68
 ==========  ========== =========
 
-I've not yet been able to import Eurovoc using fastimport (killed by the system because it consumes
-too much memory).
+Auc: (all without hooks)
+==========  ========== =========
+Store       time.clock time.time
+==========  ========== =========
+massive          24.64     34.86
+fastimport       97.05    183.74
+sqlgen          161.51    204.00
+nohook             Nan       Nan
+==========  ========== =========
 
 
 API
-------------- next part --------------
# HG changeset patch
# User Aurelien Campeas <aurelien.campeas at logilab.fr>
# Date 1424950236 -3600
#      Thu Feb 26 12:30:36 2015 +0100
# Node ID 1db118149181d0c57944c8c0c77743694a0e44c4
# Parent  2fd4d30a73bd1fddfa1365fa1a6a521dad133df4
activate hooks (for fastimport)

diff --git a/benchmark.py b/benchmark.py
--- a/benchmark.py
+++ b/benchmark.py
@@ -65,8 +65,8 @@ class PostgresImportTC(CubicWebTC):
             self.assertEqual(label.label, u'communications')
             self.failIf(cnx.execute('Any L WHERE NOT EXISTS(L pref_label_of X) AND NOT EXISTS(L alt_label_of Y) AND NOT EXISTS(L hidden_label_of Z)'))
 
-    data_file, check_imported = 'eurovoc_skos.rdf', lambda *args: None
-    #data_file, check_imported = 'siaf_matieres.xml', check_siaf_shortened
+    #data_file, check_imported = 'eurovoc_skos.rdf', lambda *args: None
+    data_file, check_imported = 'siaf_matieres.xml', check_siaf_shortened
 
     @timed
     def _test_nohook(self):
@@ -161,32 +161,31 @@ def _skos_import(cnx, fpath, impl):
     importer = importercls(cnx, store, import_log, source=cnx.repo.system_source,
                            etypes_order_hint=('ConceptScheme', 'Concept', 'Label'))
     with cnx.ensure_cnx_set:
-        with cnx.deny_all_hooks_but():
-            def set_cwuri_if_needed(extentity, extid2eid=importer.extid2eid):
-                """set cwuri to extid when the external entity has to be created"""
-                if extentity.extid not in extid2eid:
-                    extentity.values['cwuri'] = set([unicode(extentity.extid)])
-                return extentity
-            stats = importer.import_entities(imap(set_cwuri_if_needed, entities))
-            if impl in 'sqlgenos':
-                store.flush()
-            elif impl in 'fastimport':
-                from cubes.worker.testutils import run_all_tasks
-                errors = []
-                store.fc.run_deferred_hooks(errors)
-                store.flush()
-                assert not errors
-                cnx.commit()
-                run_all_tasks(cnx)
-                cnx.commit()
-            elif impl == 'massive':
-                store.flush_meta_data()
-                for rdef in iter_rdef(cnx.vreg.schema):
-                    store.convert_relations(*rdef)
-                store.commit()
-                store.cleanup()
+        def set_cwuri_if_needed(extentity, extid2eid=importer.extid2eid):
+            """set cwuri to extid when the external entity has to be created"""
+            if extentity.extid not in extid2eid:
+                extentity.values['cwuri'] = set([unicode(extentity.extid)])
+            return extentity
+        stats = importer.import_entities(imap(set_cwuri_if_needed, entities))
+        if impl in 'sqlgenos':
+            store.flush()
+        elif impl in 'fastimport':
+            from cubes.worker.testutils import run_all_tasks
+            errors = []
+            store.flush()
+            assert not errors
             cnx.commit()
-        return stats
+            store.fc.run_deferred_hooks(errors)
+            run_all_tasks(cnx)
+            cnx.commit()
+        elif impl == 'massive':
+            store.flush_meta_data()
+            for rdef in iter_rdef(cnx.vreg.schema):
+                store.convert_relations(*rdef)
+            store.commit()
+            store.cleanup()
+        cnx.commit()
+    return stats
 
 
 def iter_rdef(schema):
diff --git a/results.txt b/results.txt
--- a/results.txt
+++ b/results.txt
@@ -32,12 +32,13 @@ sqlgen           2.66      3.50
 nohook           3.97      7.79
 ========== ========== =========
 
-Auc: (all without hooks)
+Auc:
 ========== ========== =========
 Store      time.clock time.time
 ========== ========== =========
 massive          0.82      1.03
-fastimport       1.74      2.85
+fi nohooks       1.74      2.85
+fi hooks        96.59    102.50
 sqlgen           2.33      2.95
 nohook           4.06      6.78
 ========== ========== =========


More information about the Cubicweb mailing list