txtai 4.0 adds content storage, SQL queries, object storage and reindexing.
Install txtai and all dependencies: pip install txtai
from txtai.embeddings import Embeddings
# Sample corpus: news headlines covering a mix of topics
data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day",
]

# Build an embeddings index that also stores the original text and binary
# objects alongside the vectors (the default stores indexed vectors only)
embeddings = Embeddings({"path": "sentence-transformers/nli-mpnet-base-v2", "content": True, "objects": True})

# Index the corpus as (id, data, tags) tuples
embeddings.index([(rowid, line, None) for rowid, line in enumerate(data)])

# Results table header
print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

queries = ("feel good story", "climate change", "public health story", "war",
           "wildlife", "asia", "lucky", "dishonest junk")

# Print the single best match for each query
for query in queries:
    best = embeddings.search(query, 1)[0]["text"]
    print("%-20s %s" % (query, best))
Query Best Match
--------------------------------------------------
feel good story Maine man wins $1M from $25 lottery ticket
climate change Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg
public health story US tops 5 million confirmed virus cases
war Beijing mobilises invasion craft along coast as Taiwan tensions escalate
wildlife The National Park Service warns against sacrificing slower friends in a bear attack
asia Beijing mobilises invasion craft along coast as Taiwan tensions escalate
lucky Maine man wins $1M from $25 lottery ticket
dishonest junk Make huge profits without work, earn up to $100,000 a day
# Rebuild the index, attaching a computed 'length' metadata field to each row
rows = [(rowid, {"text": line, "length": len(line)}, None) for rowid, line in enumerate(data)]
embeddings.index(rows)

# SQL query combining a similarity clause with a score filter
print(embeddings.search("select text, score from txtai where similar('hiking danger') and score >= 0.15"))

# SQL query filtering on both score and the stored 'length' metadata field
print(embeddings.search("select text, length, score from txtai where similar('feel good story') and score >= 0.05 and length >= 40"))

# Aggregate statistics over the whole index
print(embeddings.search("select count(*), min(length), max(length), sum(length) from txtai"))
print()

# Aggregates grouped per distinct text value
results = embeddings.search("select count(*), min(length), max(length), sum(length), text, score from txtai group by text limit 10")
for result in results:
    print(result)
[{'text': 'The National Park Service warns against sacrificing slower friends in a bear attack', 'score': 0.3151373267173767}]
[{'text': 'Maine man wins $1M from $25 lottery ticket', 'length': 42, 'score': 0.08329004049301147}]
[{'count(*)': 6, 'min(length)': 39, 'max(length)': 94, 'sum(length)': 387}]
{'count(*)': 1, 'min(length)': 72, 'max(length)': 72, 'sum(length)': 72, 'text': 'Beijing mobilises invasion craft along coast as Taiwan tensions escalate', 'score': None}
{'count(*)': 1, 'min(length)': 94, 'max(length)': 94, 'sum(length)': 94, 'text': "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg", 'score': None}
{'count(*)': 1, 'min(length)': 42, 'max(length)': 42, 'sum(length)': 42, 'text': 'Maine man wins $1M from $25 lottery ticket', 'score': None}
{'count(*)': 1, 'min(length)': 57, 'max(length)': 57, 'sum(length)': 57, 'text': 'Make huge profits without work, earn up to $100,000 a day', 'score': None}
{'count(*)': 1, 'min(length)': 83, 'max(length)': 83, 'sum(length)': 83, 'text': 'The National Park Service warns against sacrificing slower friends in a bear attack', 'score': None}
{'count(*)': 1, 'min(length)': 39, 'max(length)': 39, 'sum(length)': 39, 'text': 'US tops 5 million confirmed virus cases', 'score': None}
# Fix: 'import urllib' alone does not guarantee the 'request' submodule is
# loaded; urllib.request must be imported explicitly for urlopen to resolve.
import urllib.request

from IPython.display import Image

# Download a sample image (the txtai demo animation)
request = urllib.request.urlopen("https://raw.githubusercontent.com/neuml/txtai/master/demo.gif")

# Upsert a record carrying both text and a binary object payload
embeddings.upsert([("txtai", {"text": "txtai executes machine-learning workflows to transform data and build AI-powered semantic search applications.", "object": request.read()}, None)])

# Retrieve the stored object for the most similar match to "machine learning"
result = embeddings.search("select object from txtai where similar('machine learning') limit 1")[0]["object"]

# Display image — result exposes getvalue(), i.e. a BytesIO-style buffer
Image(result.getvalue(), width=600)
# Print index info before (info() is also new!)
embeddings.info()
# Reindex with a different model; only new settings are passed in — no raw
# data is re-supplied, so vectors are rebuilt from the stored content
embeddings.reindex({"path": "sentence-transformers/paraphrase-MiniLM-L3-v2"})
print("------")
# Print index info after — note the changed dimensions and path fields
embeddings.info()
{
"backend": "faiss",
"build": {
"create": "2022-01-05T01:44:13Z",
"python": "3.7.12",
"settings": {
"components": "IDMap,Flat"
},
"system": "Linux (x86_64)",
"txtai": "4.0.0"
},
"content": "sqlite",
"dimensions": 768,
"objects": true,
"offset": 7,
"path": "sentence-transformers/nli-mpnet-base-v2"
}
------
{
"backend": "faiss",
"build": {
"create": "2022-01-05T01:44:19Z",
"python": "3.7.12",
"settings": {
"components": "IDMap,Flat"
},
"system": "Linux (x86_64)",
"txtai": "4.0.0"
},
"content": "sqlite",
"dimensions": 384,
"objects": true,
"offset": 7,
"path": "sentence-transformers/paraphrase-MiniLM-L3-v2"
}
# Save index as tar.xz (compression format inferred from the extension)
embeddings.save("index.tar.xz")
# Inspect archive contents and compression stats (IPython shell escapes)
!tar -tvJf index.tar.xz
!echo
!xz -l index.tar.xz
!echo
# Reload index from the saved archive
embeddings.load("index.tar.xz")
# Test search against the reloaded index
embeddings.search("lucky guy", 1)
drwx------ root/root 0 2022-01-05 01:44 ./
-rw-r--r-- root/root 290 2022-01-05 01:44 ./config
-rw-r--r-- root/root 77824 2022-01-05 01:44 ./documents
-rw-r--r-- root/root 10898 2022-01-05 01:44 ./embeddings
Strms Blocks Compressed Uncompressed Ratio Check Filename
1 1 45.8 KiB 100.0 KiB 0.458 CRC64 index.tar.xz
[{'id': '4',
'score': 0.3691234290599823,
'text': 'Maine man wins $1M from $25 lottery ticket'}]
import numpy as np
import requests
import time
def transform(text):
    """Compute sentence embeddings via the Hugging Face Inference API.

    Accepts either raw text (a string or list of strings) or a
    (id, data, tags) tuple as passed by txtai's external vectorization hook.

    Returns a float32 numpy array of embeddings.
    """
    # txtai passes (id, data, tags) tuples during indexing; extract the data field
    if isinstance(text, tuple):
        text = text[1]

    # Robustness fix: a timeout prevents an unresponsive API endpoint from
    # hanging the index build indefinitely (requests has no default timeout)
    response = requests.post("https://api-inference.huggingface.co/pipeline/feature-extraction/sentence-transformers/nli-mpnet-base-v2",
                             json={"inputs": text}, timeout=60)

    return np.array(response.json(), dtype=np.float32)
# Vectorize the full corpus in a single batched API call
vectors = transform(data)

# Build an index whose vectors come from the Inference API; each record keeps
# its precomputed vector ("object") and the source text ("row")
embeddings = Embeddings({"method": "external", "transform": transform, "content": True})
embeddings.index([(i, {"object": vectors[i], "row": line}, None) for i, line in enumerate(data)])
# Results table header
print("%-20s %s" % ("Query", "Best Match"))
print("-" * 50)

topics = ("feel good story", "climate change", "public health story", "war",
          "wildlife", "asia", "lucky", "dishonest junk")

# Run a SQL similarity query per topic, aliasing the stored row back to text
for query in topics:
    match = embeddings.search(f"select id, row as text, score from txtai where similar('{query}')", 1)[0]["text"]
    print("%-20s %s" % (query, match))
Query Best Match
--------------------------------------------------
feel good story Maine man wins $1M from $25 lottery ticket
climate change Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg
public health story US tops 5 million confirmed virus cases
war Beijing mobilises invasion craft along coast as Taiwan tensions escalate
wildlife The National Park Service warns against sacrificing slower friends in a bear attack
asia Beijing mobilises invasion craft along coast as Taiwan tensions escalate
lucky Maine man wins $1M from $25 lottery ticket
dishonest junk Make huge profits without work, earn up to $100,000 a day
pip install spacy --upgrade
python -m spacy download en_core_web_md
import spacy
# Load the medium English spacy model (ships with word vectors)
nlp = spacy.load("en_core_web_md")

def transform(document):
    """Return the spacy document vector for a (id, data, tags) tuple."""
    return nlp(document[1]).vector

# Build an external-vector index backed by spacy embeddings
embeddings = Embeddings({"method": "external", "transform": transform, "content": True})
embeddings.index([(i, {"object": transform((i, line)), "row": line}, None) for i, line in enumerate(data)])

# Search with a SQL query that aliases the stored row back to text
print(embeddings.search("select id, row as text, score from txtai where similar('hiking danger')", 1))
[{'id': '3', 'text': 'The National Park Service warns against sacrificing slower friends in a bear attack', 'score': 0.5822835564613342}]