Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']

steps:
- uses: actions/checkout@v3
Expand All @@ -22,6 +22,7 @@ jobs:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip setuptools wheel
pip install --upgrade -e .[contrib,test]
pip install flake8
- name: Lint with flake8
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,4 @@ dmypy.json
/lvenv/
/models/*
/cache
.vscode
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ PYTHON_FILES = tests
test:
python -m pytest -m "not performance" tests/

gunicorn:
serve:
gunicorn -w 4 -b 127.0.0.1:5000 --reload wsgi:app

black:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,6 @@ do not need to run INCEpTION during (early) development.

The simplest way to develop in a deployment setting, i.e. using `gunicorn`, is to just run

make gunicorn
make serve

This starts `gunicorn` with 4 workers and hot-code reloading.
4 changes: 2 additions & 2 deletions ariadne/contrib/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
end = token.end
label = Counter([self._label_map[pred] for pred in grouped_prediction]).most_common(1)[0][0]
prediction = create_prediction(cas, layer, feature, begin, end, label)
cas.add_annotation(prediction)
cas.add(prediction)

def _tokenize_bert(self, cas_tokens: List[str]) -> List[torch.Tensor]:
grouped_bert_tokens = [torch.LongTensor([self._tokenizer.cls_token_id])]
Expand Down Expand Up @@ -191,7 +191,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
label_id = torch.argmax(outputs[0]).item()
label = self._label_map[label_id]
prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
cas.add_annotation(prediction)
cas.add(prediction)

def _build_model(self):
model = AutoModelWithHeads.from_pretrained(self._base_model_name)
Expand Down
14 changes: 5 additions & 9 deletions ariadne/contrib/flair.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ def fix_whitespaces(cas_tokens):
dist = following_cas_token.begin - cas_token.end
else:
dist = 1
token = Token(
cas_token.get_covered_text(),
whitespace_after=dist,
start_position=cas_token.begin
)
token = Token(cas_token.get_covered_text(), whitespace_after=dist, start_position=cas_token.begin)
tokens.append(token)
return tokens

Expand All @@ -46,7 +42,7 @@ def __init__(self, model_name: str, model_directory: Path = None, split_sentence
self._model = Tagger.load(model_name)
self._split_sentences = split_sentences

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
# Extract the sentences from the CAS
if self._split_sentences:
sentences = []
Expand All @@ -67,18 +63,18 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
end = named_entity.end_position
label = named_entity.tag
prediction = create_prediction(cas, layer, feature, begin, end, label)
cas.add(prediction)
cas.add(prediction)

else:
cas_tokens = cas.select(TOKEN_TYPE)
text = fix_whitespaces(cas_tokens)
sent = Sentence(text)

self._model.predict(sent)

for named_entity in sent.get_spans():
begin = named_entity.start_position
end = named_entity.end_position
label = named_entity.tag
prediction = create_prediction(cas, layer, feature, begin, end, label)
cas.add(prediction)
cas.add(prediction)
8 changes: 6 additions & 2 deletions ariadne/contrib/log_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,11 @@

class LogOnlyRecommender(Classifier):
def fit(self, documents: List[TrainingDocument], layer: str, feature: str, project_id, user_id: str):
print(f'Training triggered for [{feature}] on [{layer}] in [{len(documents)}] documents from project [{project_id}] for user [{user_id}]')
print(
f"Training triggered for [{feature}] on [{layer}] in [{len(documents)}] documents from project [{project_id}] for user [{user_id}]"
)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
print(f'Prediction triggered on document [{document_id}] for [{feature}] on [{layer}] in project [{project_id}] for user [{user_id}]')
print(
f"Prediction triggered on document [{document_id}] for [{feature}] on [{layer}] in project [{project_id}] for user [{user_id}]"
)
2 changes: 1 addition & 1 deletion ariadne/contrib/nltk.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
begin = cas_token.begin
end = begin + len(stem)
prediction = create_prediction(cas, layer, feature, begin, end, stem)
cas.add_annotation(prediction)
cas.add(prediction)
2 changes: 1 addition & 1 deletion ariadne/contrib/simalign.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
prediction = create_relation_prediction(
cas, layer, feature, src_tokens[source_idx], trg_tokens[target_idx], ""
)
cas.add_annotation(prediction)
cas.add(prediction)
break
2 changes: 1 addition & 1 deletion ariadne/contrib/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
if begin is not None and end is not None:
if tag == "O" or (tag.startswith("B") and prev_tag.startswith("I")):
prediction = create_prediction(cas, layer, feature, begin, end, "X")
cas.add_annotation(prediction)
cas.add(prediction)

if tag.startswith("B"):
begin = token.begin
Expand Down
2 changes: 1 addition & 1 deletion ariadne/contrib/stringmatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_
for mention, label_id in m.search(term=term, max_dist=2):
label = le.inverse_transform([label_id])[0]
prediction = create_prediction(cas, layer, feature, begin, end, label)
cas.add_annotation(prediction)
cas.add(prediction)

def _generate_candidates(self, cas: Cas, n: int):
# We generate token n-grams
Expand Down
15 changes: 6 additions & 9 deletions ariadne/contrib/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from ariadne.classifier import Classifier
from ariadne.contrib.inception_util import create_prediction
from cassis import Cas


class TransformerNerClassifier(Classifier):
def __init__(self, model_name: str):
super().__init__()
Expand All @@ -27,16 +28,12 @@ def __init__(self, model_name: str):
self.model = AutoModelForTokenClassification.from_pretrained(model_name)
self.ner_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer, aggregation_strategy="first")



def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):

document_text = cas.sofa_string
predictions = self.ner_pipeline(document_text)
for prediction in predictions:
start_char = prediction['start']
end_char = prediction['end']
label = prediction['entity_group']
start_char = prediction["start"]
end_char = prediction["end"]
label = prediction["entity_group"]
cas_prediction = create_prediction(cas, layer, feature, start_char, end_char, label)
cas.add(cas_prediction)

cas.add(cas_prediction)
100 changes: 100 additions & 0 deletions ariadne/demo/demo_link_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Licensed to the Technische Universität Darmstadt under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The Technische Universität Darmstadt
# licenses this file to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

from cassis import Cas

from ariadne.classifier import Classifier
from ariadne.protocol import TrainingDocument
from collections import defaultdict
from ariadne.contrib.inception_util import create_span_prediction

import logging

logger = logging.getLogger(__name__)


class DemoLinkFeatureRecommender(Classifier):
    """Demo recommender for link features.

    ``fit`` tallies, per lower-cased source mention, how often each lower-cased
    target mention was linked with a given role, and stores the most frequent
    role per (source, target) pair as the model. ``predict`` then suggests a
    link whenever a source and a known target co-occur in the same sentence.
    """

    def fit(self, documents: List[TrainingDocument], layer: str, feature: str, project_id, user_id: str):
        logger.info(
            f"Training triggered for [{feature}] on [{layer}] in [{len(documents)}] documents from project [{project_id}] for user [{user_id}]"
        )

        # tally[source_text][target_text][role] -> number of times this exact
        # (source, target, role) combination was annotated
        tally = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

        for document in documents:
            for span in document.cas.select(layer):
                mention = span.get_covered_text().lower()
                link_array = span.get(feature)
                if not link_array:
                    continue
                for link in link_array.elements:
                    other = link.target.get_covered_text().lower()
                    tally[mention][other][link.role.lower()] += 1

        # Reduce the tallies: keep only the most frequent role for each
        # (source, target) pair
        best_links = {}
        for mention, per_target in tally.items():
            chosen = {}
            for other, role_counts in per_target.items():
                chosen[other] = max(role_counts, key=role_counts.get) if role_counts else ""
            best_links[mention] = chosen

        logger.info("Best labels: %s", best_links)
        self._save_model(user_id, best_links)

        logger.info("Training finished for user [%s]", user_id)

    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        logger.info(
            f"Prediction triggered on document [{document_id}] for [{feature}] on [{layer}] in project [{project_id}] for user [{user_id}]"
        )

        model = self._load_model(user_id)
        if model is None:
            # Nothing trained for this user yet
            return

        # For every annotation whose text is a known source mention, look for a
        # known target inside the same sentence and suggest a link with the
        # learned role.
        for candidate in cas.select(layer):
            mention = candidate.get_covered_text().lower()
            if mention not in model:
                continue

            covering = list(
                cas.select_covering("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", candidate)
            )
            if not covering:
                continue

            known_targets = model[mention]
            for other in cas.select_covered(layer, covering[0]):
                role = known_targets.get(other.get_covered_text().lower())
                if role is None:
                    continue

                # Both ends exist in this sentence: materialize the link and
                # wrap it in an FSArray, as expected by the link feature
                LinkType = cas.typesystem.get_type("custom.SpanLinksLink")
                link_fs = LinkType(role=role, target=other)
                FSArray = cas.typesystem.get_type("uima.cas.FSArray")
                suggestion = create_span_prediction(
                    cas, layer, feature, candidate.begin, candidate.end, FSArray(elements=[link_fs])
                )
                cas.add(suggestion)

        logger.info("Prediction finished for user [%s]", user_id)
Loading