This example trains a text classifier without hand-labeled data: a zero-shot model generates weak labels for a sample of sentences, and those labels train a smaller, faster model.

Install txtai and all dependencies.

pip install txtai
import random

from datasets import load_dataset
from txtai.pipeline import Labels

def batch(texts, size):
    """Splits texts into batches of the given size."""
    return [texts[x : x + size] for x in range(0, len(texts), size)]

# Set random seed for repeatable sampling
random.seed(42)

# Load the sst2 dataset and sample 1,000 training sentences
ds = load_dataset("glue", "sst2")
sentences = random.sample(ds["train"]["sentence"], 1000)

# Load a zero shot classifier - txtai provides this through the Labels pipeline
labels = Labels("microsoft/deberta-large-mnli")

# Zero-shot prediction using ["negative", "positive"] labels. Each prediction is a
# list of (label id, score) tuples sorted by score, so label[0][0] is the id of
# the best matching label.
train = []
for chunk in batch(sentences, 32):
    train.extend([{"text": chunk[x], "label": label[0][0]} for x, label in enumerate(labels(chunk, ["negative", "positive"]))])
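To sanity-check the weak labels before training, a minimal sketch that inspects one generated record (the printed values are illustrative, since the text comes from the random sample):

# Inspect a weakly labeled training record
print(train[0])
# e.g. {'text': '...', 'label': 0}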
Next, use the zero-shot predictions to train a smaller model. txtai provides this through the HFTrainer pipeline.

from txtai.pipeline import HFTrainer

# Train a sequence classifier with the weakly labeled data
trainer = HFTrainer()
model, tokenizer = trainer("google/electra-base-discriminator", train, num_train_epochs=5)
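The returned model and tokenizer are standard Hugging Face objects, so they can be saved for later use; a minimal sketch, assuming a hypothetical output path:

# Persist the trained model and tokenizer (output path is an assumption)
model.save_pretrained("sst2-electra")
tokenizer.save_pretrained("sst2-electra")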
As a baseline, measure the accuracy of the zero-shot classifier directly against the sst2 validation set.

labels = Labels("microsoft/deberta-large-mnli")

# Compare zero-shot predictions to the gold validation labels
results = [row["label"] == labels(row["sentence"], ["negative", "positive"])[0][0] for row in ds["validation"]]
sum(results) / len(ds["validation"])

0.8818807339449541
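The evaluation loop above scores one sentence per call. A sketch of the same measurement reusing the batch helper from earlier, which should produce the same accuracy with far fewer pipeline calls:

# Batched variant of the validation loop (same result, fewer calls)
valid = ds["validation"]["sentence"]
gold = ds["validation"]["label"]

preds = []
for chunk in batch(valid, 32):
    preds.extend([p[0][0] for p in labels(chunk, ["negative", "positive"])])

sum(p == g for p, g in zip(preds, gold)) / len(gold)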
Now run the same evaluation with the newly trained model. dynamic=False tells the Labels pipeline to run standard text classification with the model's fixed labels instead of zero-shot classification.

labels = Labels((model, tokenizer), dynamic=False)

# Compare trained model predictions to the gold validation labels
results = [row["label"] == labels(row["sentence"])[0][0] for row in ds["validation"]]
sum(results) / len(ds["validation"])

0.8738532110091743

The trained model lands within a point of the zero-shot baseline (87.4% vs 88.2% accuracy) with a much smaller model, despite never seeing a hand-labeled example.
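As a usage sketch, the trained classifier labels new text directly (the example sentence is hypothetical; with dynamic=False, the returned ids map to the training labels, 0 = negative and 1 = positive):

# Classify a new sentence with the trained model (example sentence is hypothetical)
print(labels("one of the best films of the year")[0][0])
# expected: 1 (positive)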