37
loading...
This website collects cookies to deliver better user experience
Install txtai and all dependencies.
pip install txtai
import pandas as pd
from txtai.pipeline import HFTrainer, Questions, Labels
# Training data for few-shot learning.
# Every question is listed twice: once paired with a context that contains the
# answer and once with a context that does not ("answers": None). The question
# text is kept identical between the positive and negative example of each pair
# and matches the queries asked at inference time.
data = [
    {
        "question": "What is the url?",
        "context": "Faiss (https://github.com/facebookresearch/faiss) is a library for efficient similarity search.",
        "answers": "https://github.com/facebookresearch/faiss",
    },
    {"question": "What is the url?", "context": "The last release was Wed Sept 25 2021", "answers": None},
    {"question": "What is the date?", "context": "The last release was Wed Sept 25 2021", "answers": "Wed Sept 25 2021"},
    {"question": "What is the date?", "context": "The order total comes to $44.33", "answers": None},
    {"question": "What is the amount?", "context": "The order total comes to $44.33", "answers": "$44.33"},
    {"question": "What is the amount?", "context": "The last release was Wed Sept 25 2021", "answers": None},
]

# Fine-tune QA model: start from a SQuAD-distilled checkpoint and train it on
# the examples above. The call returns the trained model and its tokenizer.
trainer = HFTrainer()
model, tokenizer = trainer("distilbert-base-cased-distilled-squad", data, task="question-answering")
# Sample texts to extract fields from
context = [
    "Released on 6/03/2021",
    "Release delayed until the 11th of August",
    "Documentation can be found here: neuml.github.io/txtai",
    "The stock price fell to three dollars",
    "Great day: closing price for March 23rd is $33.11, for details - https://finance.google.com",
]

# One question per output column
queries = [
    "What is the url?",
    "What is the date?",
    "What is the amount?",
]

# Extractive QA pipeline backed by the fine-tuned model and tokenizer
questions = Questions(path=(model, tokenizer), gpu=True)

# Ask each question of every text; each pass produces one column of answers
results = []
for query in queries:
    repeated = [query] * len(context)
    results.append(questions(repeated, context))

# The source text itself is the last column
results.append(context)

# Transpose the columns into rows and show them as a DataFrame
pd.DataFrame(
    list(zip(*results)),
    columns=["URL", "Date", "Amount", "Text"],
)
| | URL | Date | Amount | Text |
|---|---|---|---|---|
| 0 | None | 6/03/2021 | None | Released on 6/03/2021 |
| 1 | None | 11th of August | None | Release delayed until the 11th of August |
| 2 | neuml.github.io/txtai | None | None | Documentation can be found here: neuml.github.... |
| 3 | None | None | three dollars | The stock price fell to three dollars |
| 4 | https://finance.google.com | March 23rd | $33.11 | Great day: closing price for March 23rd is $33... |
# Add sentiment: classify every text with an SST-2 sentiment model
labels = Labels(
    path="distilbert-base-uncased-finetuned-sst-2-english",
    dynamic=False,
)
predictions = labels(context)

# Map the first prediction's label id to a readable name (id 1 -> POSITIVE).
# Note: `labels` is rebound here from the pipeline to the list of names.
labels = []
for prediction in predictions:
    top_id = prediction[0][0]
    labels.append("POSITIVE" if top_id == 1 else "NEGATIVE")

# Place the sentiment column just before the trailing text column
results.insert(-1, labels)

# Load into DataFrame
pd.DataFrame(
    list(zip(*results)),
    columns=["URL", "Date", "Amount", "Sentiment", "Text"],
)
| | URL | Date | Amount | Sentiment | Text |
|---|---|---|---|---|---|
| 0 | None | 6/03/2021 | None | POSITIVE | Released on 6/03/2021 |
| 1 | None | 11th of August | None | NEGATIVE | Release delayed until the 11th of August |
| 2 | neuml.github.io/txtai | None | None | NEGATIVE | Documentation can be found here: neuml.github.... |
| 3 | None | None | three dollars | NEGATIVE | The stock price fell to three dollars |
| 4 | https://finance.google.com | March 23rd | $33.11 | POSITIVE | Great day: closing price for March 23rd is $33... |