25
loading...
This website collects cookies to deliver better user experience
Install txtai and all dependencies. Since this article uses ONNX exports, we need to install the pipeline extras package.

pip install txtai[pipeline,similarity] datasets
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Load the "emotion" dataset
ds = load_dataset("emotion")

# Train the model: TF-IDF features feeding a logistic regression classifier
pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("lr", LogisticRegression(max_iter=250)),
    ]
)
pipeline.fit(ds["train"]["text"], ds["train"]["label"])

# Determine accuracy on validation set: one boolean per row, True when the
# predicted label equals the expected label
labels = ds["validation"]["label"]
predictions = pipeline.predict(ds["validation"]["text"])
results = [predicted == expected for predicted, expected in zip(predictions, labels)]
print("Accuracy =", sum(results) / len(ds["validation"]))
Accuracy = 0.8595
from txtai.pipeline import Labels, MLOnnx, Similarity
def tokenize(inputs, **kwargs):
    """
    Pass-through "tokenizer" used with the scikit-learn ONNX model in this file.

    No real tokenization happens here: each input text is wrapped in its own
    single-element list and returned under the "input_ids" key.

    Args:
        inputs: a single string or an iterable of strings
        kwargs: accepted for call compatibility and ignored

    Returns:
        dict of the form {"input_ids": [[text], [text], ...]}
    """

    # Normalize a single string to a one-element batch
    if isinstance(inputs, str):
        inputs = [inputs]

    return {"input_ids": [[x] for x in inputs]}
def query(model, tokenizer, multilabel=False):
    """
    Runs a fixed similarity test query ("joy") against a set of sample
    sentences and prints each sentence with its score.

    Args:
        model: model passed to the Similarity pipeline
        tokenizer: tokenizer (or tokenizer-like callable) paired with model
        multilabel: forwarded as the third argument of the similarity call
    """

    # Load models into similarity pipeline
    similarity = Similarity((model, tokenizer), dynamic=False)

    # Add labels to model
    config = similarity.pipeline.model.config
    config.id2label = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
    config.label2id = {label: uid for uid, label in config.id2label.items()}

    inputs = ["that caught me off guard", "I didn t see that coming", "i feel bad", "What a wonderful goal!"]
    scores = similarity("joy", inputs, multilabel)

    # Each result is an (index into inputs, score) pair; print at most five
    for uid, score in scores[:5]:
        print(inputs[uid], score)
# Export the trained scikit-learn pipeline to ONNX
onnx = MLOnnx()
model = onnx(pipeline)

# Create labels pipeline using scikit-learn ONNX model
sklabels = Labels((model, tokenize), dynamic=False)

# Add labels to model: ids are the positions in this list
config = sklabels.pipeline.model.config
config.id2label = dict(enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"]))
config.label2id = {name: index for index, name in config.id2label.items()}

# Run test query using model
query(model, tokenize, None)
What a wonderful goal! 0.909473717212677
I didn t see that coming 0.47113093733787537
that caught me off guard 0.42067453265190125
i feel bad 0.019547615200281143
# Set predictable seeds
import os
import random
import torch
import numpy as np
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import AutoConfig, AutoTokenizer
from txtai.models import Registry
from txtai.pipeline import HFTrainer
from transformers.modeling_outputs import SequenceClassifierOutput
def seed(seed=42):
    """
    Seeds the random number generators used in this file so runs are repeatable.

    Seeds Python's random module, NumPy and PyTorch (CPU and CUDA), exports
    PYTHONHASHSEED to the environment and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed value, defaults to 42
    """

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
class Simple(nn.Module):
    """
    Small text classifier: an nn.EmbeddingBag layer followed by a linear layer.

    The forward signature and return type follow the sequence classification
    convention (input_ids/labels in, SequenceClassifierOutput out).
    """

    def __init__(self, vocab, dimensions, labels):
        """
        Creates a new model.

        Args:
            vocab: vocabulary size (number of embedding rows)
            dimensions: embedding size
            labels: number of output labels
        """

        super().__init__()

        # Reuses the bert-base-uncased configuration so the module exposes a
        # config attribute -- NOTE(review): presumably required by the trainer
        # and pipelines this model is passed to; confirm
        self.config = AutoConfig.from_pretrained("bert-base-uncased")
        self.labels = labels

        self.embedding = nn.EmbeddingBag(vocab, dimensions)
        self.classifier = nn.Linear(dimensions, labels)
        self.init_weights()

    def init_weights(self):
        """
        Initializes embedding and classifier weights uniformly in [-0.5, 0.5]
        and zeroes the classifier bias.
        """

        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.classifier.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()

    def forward(self, input_ids=None, labels=None, **kwargs):
        """
        Runs a forward pass.

        Args:
            input_ids: token ids passed to the embedding layer
            labels: optional target labels; when set, a cross entropy loss is computed
            kwargs: additional arguments, ignored

        Returns:
            SequenceClassifierOutput with loss (None when labels is None) and logits
        """

        embeddings = self.embedding(input_ids)
        logits = self.classifier(embeddings)

        loss = None
        if labels is not None:
            # Flatten logits to (-1, number of labels) and labels to (-1,)
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )
# Set seed for reproducibility (must run before the model weights are initialized)
seed()

# Define model: one output per distinct label in the training split
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = Simple(tokenizer.vocab_size, 128, len(ds["train"].unique("label")))

# Train model
train = HFTrainer()
model, tokenizer = train(
    (model, tokenizer),
    ds["train"],
    per_device_train_batch_size=8,
    learning_rate=1e-3,
    num_train_epochs=15,
    logging_steps=10000,
)

# Register custom model to fully support pipelines
Registry.register(model)

# Create labels pipeline using PyTorch model
thlabels = Labels((model, tokenizer), dynamic=False)

# Determine accuracy on validation set: compare the expected label with the
# id of the first result returned by the labels pipeline
results = []
for row in ds["validation"]:
    predicted = thlabels(row["text"])[0][0]
    results.append(row["label"] == predicted)

print("Accuracy = ", sum(results) / len(ds["validation"]))
Accuracy = 0.883
# Run the same test query using the PyTorch model and its tokenizer
query(model, tokenizer)
What a wonderful goal! 1.0
that caught me off guard 0.9998751878738403
I didn t see that coming 0.7328283190727234
i feel bad 5.2972134609891875e-19
from txtai.embeddings import Embeddings
class SimpleEmbeddings(nn.Module):
    """
    Wraps an embeddings layer so that it can be called like a model: forward
    takes input_ids and returns a one-element tuple with the layer output.
    """

    def __init__(self, embeddings):
        """
        Creates a new wrapper.

        Args:
            embeddings: callable embeddings layer, invoked with input_ids
        """

        super().__init__()

        self.embeddings = embeddings

    def forward(self, input_ids=None, **kwargs):
        """
        Runs input_ids through the wrapped embeddings layer.

        Args:
            input_ids: token ids passed to the embeddings layer
            kwargs: additional arguments, ignored

        Returns:
            tuple with a single element, the embeddings layer output
        """

        return (self.embeddings(input_ids),)
# Build an embeddings instance on top of the trained model's EmbeddingBag layer
embeddings = Embeddings(
    {
        "method": "pooling",
        "path": SimpleEmbeddings(model.embedding),
        "tokenizer": "bert-base-uncased",
    }
)

# Score each candidate sentence against the query "mad"
print(
    embeddings.similarity(
        "mad",
        ["Glad you found it", "Happy to see you", "I'm angry"],
    )
)
[(2, 0.8323876857757568), (1, -0.11010512709617615), (0, -0.16152513027191162)]
# Train a classifier starting from the microsoft/xtremedistil-l6-h384-uncased checkpoint
train = HFTrainer()
model, tokenizer = train(
    "microsoft/xtremedistil-l6-h384-uncased",
    ds["train"],
    logging_steps=2000,
)

# Create labels pipeline using the trained model
tflabels = Labels((model, tokenizer), dynamic=False)

# Determine accuracy on validation set: compare the expected label with the
# id of the first result returned by the labels pipeline
results = []
for row in ds["validation"]:
    predicted = tflabels(row["text"])[0][0]
    results.append(row["label"] == predicted)

print("Accuracy = ", sum(results) / len(ds["validation"]))
Accuracy = 0.93
import time
# Test inputs
inputs = ds["test"]["text"]
print(f"Testing speed of {len(inputs)} items")

# Elapsed time is measured with time.perf_counter, a monotonic high-resolution
# clock intended for timing; time.time can jump if the system clock is adjusted
start = time.perf_counter()
r1 = sklabels(inputs, multilabel=None)
print("TF-IDF + Logistic Regression time =", time.perf_counter() - start)

start = time.perf_counter()
r2 = thlabels(inputs)
print("PyTorch time =", time.perf_counter() - start)

start = time.perf_counter()
r3 = tflabels(inputs)
print("Transformers time =", time.perf_counter() - start, "\n")

# Compare model results: first result of each model for the first five inputs
for x in range(5):
    print(f"index: {x}")
    print(r1[x][0])
    print(r2[x][0])
    print(r3[x][0], "\n")
Testing speed of 2000 items
TF-IDF + Logistic Regression time = 1.116208791732788
PyTorch time = 2.2385385036468506
Transformers time = 15.705108880996704
index: 0
(0, 0.7258279323577881)
(0, 1.0)
(0, 0.998250424861908)
index: 1
(0, 0.854256272315979)
(0, 1.0)
(0, 0.9981004595756531)
index: 2
(0, 0.6306578516960144)
(0, 0.9999700784683228)
(0, 0.9981676340103149)
index: 3
(1, 0.554378092288971)
(1, 0.9998960494995117)
(1, 0.9985388517379761)
index: 4
(0, 0.8961835503578186)
(0, 1.0)
(0, 0.9981957077980042)