Merge pull request #71 from inception-project/feature/70-Provide-demo-for-recommender-that-suggests-multiple-features-at-the-same-time

reckart · web-flow · commit e1a1a90c563d · 2025-10-28T08:20:05.000+01:00
#70 - Provide demo for recommender that suggests multiple features at the same time
diff --git a/ariadne/contrib/sbert.py b/ariadne/contrib/sbert.py
@@ -78,9 +78,9 @@ def fit(self, documents: List[TrainingDocument], layer: str, feature: str, proje
                 else:
                     continue
 
-                assert (
-                    sentence.begin == annotation.begin and sentence.end == annotation.end
-                ), "Annotation should cover sentence fully!"
+                assert sentence.begin == annotation.begin and sentence.end == annotation.end, (
+                    "Annotation should cover sentence fully!"
+                )
 
                 label = getattr(annotation, feature)
 
diff --git a/ariadne/contrib/sklearn.py b/ariadne/contrib/sklearn.py
@@ -51,9 +51,9 @@ def fit(self, documents: List[TrainingDocument], layer: str, feature: str, proje
                 else:
                     continue
 
-                assert (
-                    sentence.begin == annotation.begin and sentence.end == annotation.end
-                ), "Annotation should cover sentence fully!"
+                assert sentence.begin == annotation.begin and sentence.end == annotation.end, (
+                    "Annotation should cover sentence fully!"
+                )
 
                 label = getattr(annotation, feature)
 
diff --git a/ariadne/demo/demo_link_feature.py b/ariadne/demo/demo_link_feature.py
@@ -28,6 +28,30 @@
 
 
 class DemoLinkFeatureRecommender(Classifier):
+    """Simple demo recommender that learns link roles between span annotations.
+
+    Training
+    --------
+    For each document, the recommender iterates over all annotations of the
+    given ``layer`` and reads the ``feature`` field, which is expected to contain
+    link objects (UIMA link relations). It counts how often a source span
+    text (lowercased) was linked to a particular target span text with a
+    given role. The model stored per-user is a nested mapping:
+    ``{source_text: {target_text: best_role}}``, where ``best_role`` is the
+    role with the highest frequency for that (source, target) pair.
+
+    Prediction
+    ----------
+    The ``predict`` method loads the per-user model and iterates over source
+    annotations in the CAS. For each source whose lowercased covered text
+    appears in the model, it looks for target annotations of the same
+    ``layer`` inside the covering sentence. If a target's lowercased text
+    matches a target recorded for the source, the recommender creates a span
+    prediction suggestion that contains a link to the found target using the
+    learned role. The suggestion is added to the CAS as a span prediction
+    feature structure.
+    """
+
     def fit(self, documents: List[TrainingDocument], layer: str, feature: str, project_id, user_id: str):
         logger.info(
             f"Training triggered for [{feature}] on [{layer}] in [{len(documents)}] documents from project [{project_id}] for user [{user_id}]"
diff --git a/ariadne/demo/demo_multiple_features.py b/ariadne/demo/demo_multiple_features.py
@@ -0,0 +1,141 @@
+# Licensed to the Technische Universität Darmstadt under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The Technische Universität Darmstadt
+# licenses this file to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Dict
+
+from cassis import Cas
+
+from ariadne.classifier import Classifier
+from ariadne.protocol import TrainingDocument
+from collections import defaultdict
+from ariadne.contrib.inception_util import create_span_prediction, TOKEN_TYPE
+from cassis.typesystem import TYPE_NAME_STRING
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class DemoMultipleFeaturesRecommender(Classifier):
+    """
+    Demo recommender that behaves like DemoStringFeatureRecommender but ignores the
+    provided `feature` parameter and instead trains/predicts on all string-typed
+    features of the specified layer. A separate dictionary is learned for each
+    feature (mapping mention -> best label).
+
+    This recommender requires INCEpTION 39.0 or higher. Older versions of INCEpTION
+    will only extract the configured feature, even though multiple features are trained
+    and predicted.
+    """
+
+    def _get_string_features(self, cas: Cas, layer: str) -> List[str]:
+        """Return the names of all features declared on `layer` whose range is a string.
+
+        Best-effort: an unknown layer yields an empty list (logged as a warning), and
+        features whose range type cannot be inspected are skipped.
+        """
+        try:
+            annotation_type = cas.typesystem.get_type(layer)
+        except Exception:
+            logger.warning("Cannot resolve layer [%s] in the type system; no string features found", layer)
+            return []
+
+        # Tolerate features without an inspectable range type instead of failing the whole lookup
+        return [
+            feat.name for feat in annotation_type.features
+            if getattr(getattr(feat, "rangeType", None), "name", None) == TYPE_NAME_STRING
+        ]
+
+    def fit(self, documents: List[TrainingDocument], layer: str, feature: str, project_id, user_id: str):
+        """Learn, per string feature of `layer`, the most frequent label of each lowercased mention.
+
+        The `feature` argument is ignored. String features are discovered once, from the type
+        system of the first document. Annotations with empty covered text and empty feature
+        values are not counted. The resulting ``{feature: {mention: label}}`` mapping is handed
+        to ``_save_model`` together with `user_id`.
+        """
+        logger.info(
+            "Training triggered for all string features on [%s] in [%d] documents from project [%s] for user [%s]",
+            layer,
+            len(documents),
+            project_id,
+            user_id,
+        )
+
+        # Nested tally: feature name -> lowercased mention -> label -> occurrences
+        tally: Dict[str, Dict[str, Dict[str, int]]] = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
+        string_features = None
+
+        for document in documents:
+            cas = document.cas
+            # Discovered lazily from the first document only
+            if string_features is None:
+                string_features = self._get_string_features(cas, layer)
+
+            for annotation in cas.select(layer):
+                mention = annotation.get_covered_text().lower()
+                if mention:
+                    for name in string_features:
+                        value = annotation.get(name)
+                        if value:
+                            tally[name][mention][value] += 1
+
+        # Reduce every label tally to its single most frequent label; on ties, max() keeps the
+        # label that was counted first
+        model: Dict[str, Dict[str, str]] = {
+            name: {mention: max(votes, key=votes.get) if votes else "" for mention, votes in per_mention.items()}
+            for name, per_mention in tally.items()
+        }
+
+        logger.info("Trained multiple-feature model for features: %s", list(model.keys()))
+        self._save_model(user_id, model)
+
+        logger.info("Training finished for user [%s]", user_id)
+
+    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
+        """Add a span suggestion to `cas` for every token whose lowercased text has a learned label.
+
+        The `feature` argument is ignored; all string features of `layer` are considered. Returns
+        without adding anything when ``_load_model`` yields ``None`` for `user_id`.
+        """
+        logger.info(
+            "Prediction triggered on document [%s] for all string features on [%s] in project [%s] for user [%s]",
+            document_id,
+            layer,
+            project_id,
+            user_id,
+        )
+
+        model = self._load_model(user_id)
+        if model is None:
+            return
+
+        # String features are looked up in the type system of the CAS being predicted on
+        features = self._get_string_features(cas, layer)
+        suggestion_count = 0
+        for token in cas.select(TOKEN_TYPE):
+            mention = token.get_covered_text().lower()
+            for feat in features:
+                feature_model = model.get(feat)
+                if not feature_model or mention not in feature_model:
+                    continue
+                label = feature_model[mention]
+                # Use the token's own end offset: lowercasing can change the length of the text
+                suggestion = create_span_prediction(cas, layer, feat, token.begin, token.end, label)
+                logger.info("Creating suggestion for feature [%s]: %s -> %s", feat, mention, label)
+                cas.add(suggestion)
+                suggestion_count += 1
+
+        logger.info("Prediction finished for user [%s]; suggestions created: %d", user_id, suggestion_count)
diff --git a/scripts/util.py b/scripts/util.py
@@ -24,7 +24,6 @@
 
 
 def download_file(url: str, target_path: Path):
-
     if target_path.exists():
         logging.info("File already exists: [%s]", str(target_path.resolve()))
         return
diff --git a/tests/demo/test_demo_multiple_features.py b/tests/demo/test_demo_multiple_features.py
@@ -0,0 +1,71 @@
+# Licensed to the Technische Universität Darmstadt under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The Technische Universität Darmstadt
+# licenses this file to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ariadne.demo.demo_multiple_features import DemoMultipleFeaturesRecommender
+from ariadne.protocol import TrainingDocument
+from cassis import Cas
+from ariadne.contrib.inception_util import create_span_prediction, TOKEN_TYPE
+from tests.util import create_cas
+from ariadne.contrib.inception_util import IS_PREDICTION
+
+
+def test_demo_multiple_features_fit_and_predict():
+    """Train on one document, then expect one suggestion per string feature on a fresh CAS."""
+    cas_train = create_cas()
+    cas_train.sofa_string = "Hello world"
+    # Create a custom predicted type that has two string features so the recommender
+    # actually learns separate dictionaries per feature
+    ts = cas_train.typesystem
+    CustomPred = ts.create_type("ariadne.testtype_multi")
+    ts.create_feature(CustomPred, "value1", "uima.cas.String")
+    ts.create_feature(CustomPred, "value2", "uima.cas.String")
+    ts.create_feature(CustomPred, IS_PREDICTION, "uima.cas.Boolean")
+
+    # Add two training annotations on the same covered text but with different feature values
+    span1 = create_span_prediction(cas_train, "ariadne.testtype_multi", "value1", 0, 5, "GREETING")
+    span2 = create_span_prediction(cas_train, "ariadne.testtype_multi", "value2", 0, 5, "HELLO")
+    cas_train.add(span1)
+    cas_train.add(span2)
+
+    docs = [TrainingDocument(cas_train, "doc1", "user1")]
+
+    recommender = DemoMultipleFeaturesRecommender()
+    # feature argument is ignored by the recommender
+    recommender.fit(docs, "ariadne.testtype_multi", "value1", project_id=1, user_id="user1")
+
+    # Create a new CAS to predict into using the same typesystem as the training CAS
+    predict_cas = Cas(cas_train.typesystem)
+    predict_cas.sofa_string = cas_train.sofa_string
+
+    Token = predict_cas.typesystem.get_type(TOKEN_TYPE)
+    predict_cas.add(Token(begin=0, end=5))
+    predict_cas.add(Token(begin=6, end=11))
+
+    recommender.predict(
+        predict_cas,
+        "ariadne.testtype_multi",
+        "value1",
+        project_id=1,
+        document_id="doc1",
+        user_id="user1",
+    )
+
+    # Only the "Hello" token (offsets 0-5) is known, so each feature gets exactly one suggestion
+    preds_v1 = [a for a in predict_cas.select("ariadne.testtype_multi") if a.get("value1") == "GREETING"]
+    preds_v2 = [a for a in predict_cas.select("ariadne.testtype_multi") if a.get("value2") == "HELLO"]
+
+    assert [(a.begin, a.end) for a in preds_v1] == [(0, 5)]
+    assert [(a.begin, a.end) for a in preds_v2] == [(0, 5)]
diff --git a/wsgi.py b/wsgi.py
@@ -13,6 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from ariadne.demo.demo_link_feature import DemoLinkFeatureRecommender
+from ariadne.demo.demo_multiple_features import DemoMultipleFeaturesRecommender
+from ariadne.demo.demo_relation import DemoRelationLayerRecommender
+from ariadne.demo.demo_string_array_feature import DemoStringArrayFeatureRecommender
+from ariadne.demo.demo_string_feature import DemoStringFeatureRecommender
 from ariadne.server import Server
 from ariadne.util import setup_logging
 from ariadne.contrib.spacy import SpacyNerClassifier
@@ -21,10 +26,11 @@
 
 server = Server()
 
-# server.add_classifier("demo_string_feature", DemoStringFeatureRecommender())
-# server.add_classifier("demo_string_array_feature", DemoStringArrayFeatureRecommender())
-# server.add_classifier("demo_link_feature", DemoLinkFeatureRecommender())
-# server.add_classifier("demo_relation_layer", DemoRelationLayerRecommender())
+server.add_classifier("demo_string_feature", DemoStringFeatureRecommender())
+server.add_classifier("demo_string_array_feature", DemoStringArrayFeatureRecommender())
+server.add_classifier("demo_link_feature", DemoLinkFeatureRecommender())
+server.add_classifier("demo_relation_layer", DemoRelationLayerRecommender())
+server.add_classifier("demo_multiple_features", DemoMultipleFeaturesRecommender())
 
 server.add_classifier("spacy_ner", SpacyNerClassifier("en_core_web_sm"))
 # server.add_classifier("spacy_pos", SpacyPosClassifier("en_core_web_sm"))