Spaces:
Runtime error
Runtime error
Added files
Browse files- Dockerfile +16 -0
- promptSearchEngine.py +83 -0
- requirements.txt +0 -0
- run.py +46 -0
- run_local_ui.py +67 -0
- run_ui.py +33 -0
- vectorizer.py +24 -0
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Create a non-root account with UID 1000 and run every following step as it.
RUN useradd -m -u 1000 user
USER user
# Put the user's ~/.local/bin first on PATH (presumably where pip places
# console scripts such as uvicorn for a non-root install — confirm).
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy only the requirements file and install from it before copying the rest
# of the sources (presumably so this layer is reused when only code changes).
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the remaining project files, owned by the unprivileged user.
COPY --chown=user . /app
# Serve the `app` object from run.py with uvicorn on all interfaces, port 7860.
CMD ["uvicorn", "run:app", "--host", "0.0.0.0", "--port", "7860"]
|
promptSearchEngine.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Sequence, Tuple
|
| 2 |
+
import numpy as np
|
| 3 |
+
from vectorizer import Vectorizer
|
| 4 |
+
|
| 5 |
+
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """Calculate cosine similarity between a query and every corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (D,) or (1, D); it is
            flattened to one dimension before use.
        corpus_vectors: Vectorized prompt corpus of shape (N, D). A single
            vector of shape (D,) is treated as a corpus of one row.

    Returns:
        The vector of shape (N,) with values in range [-1, 1], rounded to
        four decimals, where 1 is max similarity i.e., two vectors point in
        the same direction. Rows where either vector has zero length have no
        defined angle and are reported as 0.0 instead of NaN.
    """
    # Accept both (D,) and (1, D) so the documented shape actually works;
    # np.dot of (N, D) with (1, D) would otherwise raise a shape error.
    query = np.asarray(query_vector).reshape(-1)
    corpus = np.atleast_2d(np.asarray(corpus_vectors))

    dot_product = corpus @ query
    denominator = np.linalg.norm(corpus, axis=1) * np.linalg.norm(query)

    # Zero-length vectors make the denominator 0; silence the warning for
    # those entries and replace the resulting NaN/inf with a neutral 0.0.
    with np.errstate(divide="ignore", invalid="ignore"):
        cosine_sim = dot_product / denominator
    cosine_sim = np.where(denominator == 0, 0.0, cosine_sim)

    return np.around(cosine_sim, 4)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class PromptSearchEngine:
    """Similarity search over a fixed corpus of prompts."""

    def __init__(self, prompts: Sequence[str], model) -> None:
        """Initialize search engine by vectorizing prompt corpus.

        The vectorized corpus is kept on the instance and reused by every
        call to ``most_similar``.

        Args:
            prompts: The sequence of raw prompts from the dataset.
            model: The embedding model handed to ``Vectorizer``.
        """
        self.prompts = prompts
        self.vectorizer = Vectorizer(model)
        self.corpus_embeddings = self.vectorizer.transform(prompts)

    def most_similar(
        self,
        query: str,
        n: int = 5
    ) -> List[Tuple[float, str]]:
        """Return top n most similar prompts from corpus.

        The query is vectorized with the engine's Vectorizer and compared
        against the stored corpus embeddings with ``cosine_similarity``.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts returned from the corpus.
                ``None`` returns the whole corpus; zero or a negative value
                returns an empty list.

        Returns:
            The list of ``(score, prompt)`` pairs for the top n most similar
            prompts, best match first. Scores are plain Python floats and
            prompts are returned verbatim.
        """
        # A negative n would otherwise slice as "all but the last |n|".
        if n is not None and n <= 0:
            return []

        query_embedding = self.vectorizer.transform([query]).flatten()
        scores = cosine_similarity(query_embedding, self.corpus_embeddings)

        # float() turns numpy scalars into the annotated, JSON-friendly type.
        scored_prompts = [
            (float(score), prompt)
            for score, prompt in zip(scores, self.prompts)
        ]
        # list.sort is stable, so equal scores keep their corpus order.
        scored_prompts.sort(key=lambda pair: pair[0], reverse=True)
        return scored_prompts[:n]

    def display_prompts(self, prompts):
        """Print the list of prompts with their similarity scores.

        Args:
            prompts: Iterable of ``(score, prompt)`` pairs; when empty or
                ``None`` a "No prompts found." line is printed instead.
        """
        if not prompts:
            print("No prompts found.")
            return
        for i, (score, prompt) in enumerate(prompts, 1):
            print(f"{i}. {prompt} (Similarity: {score:.4f})")

    def stringify_prompts(self, prompts):
        """Format the list of prompts with their similarity scores.

        Args:
            prompts: Iterable of ``(score, prompt)`` pairs, or ``None``.

        Returns:
            One numbered string per pair, or an empty list when ``prompts``
            is empty or ``None``.
        """
        if not prompts:
            return []
        return [
            f"{i}. {prompt} (Similarity: {score:.4f})"
            for i, (score, prompt) in enumerate(prompts, 1)
        ]
|
requirements.txt
ADDED
|
Binary file (260 Bytes). View file
|
|
|
run.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from promptSearchEngine import PromptSearchEngine
|
| 5 |
+
from vectorizer import Vectorizer
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
|
| 9 |
+
# Model name passed to SentenceTransformer to embed prompts.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Dataset identifier passed to load_dataset for the prompt corpus.
DATASET = "Gustavosta/Stable-Diffusion-Prompts"



# The statements below run once, when this module is imported.
model = SentenceTransformer(EMBEDDING_MODEL)
# Only the first 1% of the "test" split is requested.
dataset = load_dataset(DATASET , split="test[:1%]")
# The engine vectorizes the "Prompt" column in its constructor, so the corpus
# embeddings are ready before any request handler runs.
promptSearchEngine = PromptSearchEngine(dataset["Prompt"], model)
|
| 17 |
+
|
| 18 |
+
from typing import Optional  # kept next to its only use so this block stands alone


class SearchRequest(BaseModel):
    """Request body for ``POST /search``.

    Attributes are documented inline because pydantic treats every annotated
    name in the class body as a field.
    """

    # Raw prompt text to search for.
    query: str
    # How many results to return. Written as Optional[int] rather than
    # `int | None`: the PEP 604 form is evaluated when the class is created
    # and raises TypeError on Python 3.9, the version the Docker image uses.
    n: Optional[int] = 5
|
| 21 |
+
|
| 22 |
+
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

@app.get("/")
async def root():
    """Return a fixed message pointing callers at ``GET /docs``."""
    return {"message": 'GET /docs'}
|
| 27 |
+
|
| 28 |
+
@app.get("/search")
async def search(q: str, n: int = 5):
    """Search the prompt corpus via query-string parameters.

    Args:
        q: The raw query prompt.
        n: The number of similar prompts to return.

    Returns:
        A ``{"message": "Enter query"}`` dict when ``q`` is empty or only
        whitespace, otherwise the matches formatted as numbered strings.

    Raises:
        HTTPException: 404 when the engine returns no matches.
    """
    # Guard clause: nothing to search for.
    if not q.strip():
        return {"message": "Enter query"}

    matches = promptSearchEngine.most_similar(q, n)
    if matches:
        return promptSearchEngine.stringify_prompts(matches)
    raise HTTPException(status_code=404, detail="No prompts found.")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@app.post("/search")
async def searchPost(request: SearchRequest):
    """Search the prompt corpus using a JSON request body.

    Args:
        request: The parsed body carrying ``query`` and ``n``.

    Returns:
        ``{"data": [...]}`` where each item has a ``similarity`` float and
        the matching ``prompt``.

    Raises:
        HTTPException: 404 when the engine returns no matches.
    """
    matches = promptSearchEngine.most_similar(request.query, request.n)
    if not matches:
        raise HTTPException(status_code=404, detail="No prompts found.")

    payload = []
    for score, text in matches:
        # float() makes the score a plain Python number for the response.
        payload.append({"similarity": float(score), "prompt": text})
    return {"data": payload}
|
run_local_ui.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from promptSearchEngine import PromptSearchEngine
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import streamlit as st
|
| 6 |
+
|
| 7 |
+
# Model name passed to SentenceTransformer in load_model().
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Dataset identifier passed to load_dataset in load_dataSet().
DATASET = "Gustavosta/Stable-Diffusion-Prompts"
|
| 9 |
+
|
| 10 |
+
from typing import Optional  # kept next to its only use so this block stands alone


class SearchRequest(BaseModel):
    """Search parameters: the query text and the number of results wanted."""

    # Raw prompt text to search for.
    query: str
    # Optional[int] instead of `int | None`: the PEP 604 union is evaluated
    # at class creation and fails with TypeError on Python versions before
    # 3.10.
    n: Optional[int] = 5
|
| 13 |
+
|
| 14 |
+
# model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 15 |
+
# dataset = load_dataset("Gustavosta/Stable-Diffusion-Prompts" , split="test[:1%]")
|
| 16 |
+
# promptSearchEngine = PromptSearchEngine(dataset["Prompt"], model)
|
| 17 |
+
|
| 18 |
+
@st.cache_resource
def load_model():
    """Create the pretrained model used for vectorizing prompts.

    The ``@st.cache_resource`` annotation enables caching for Streamlit.
    """
    embedding_model = SentenceTransformer(EMBEDDING_MODEL)
    return embedding_model
|
| 24 |
+
|
| 25 |
+
@st.cache_resource
def load_dataSet():
    """Load the prompt dataset (first 1% of the "test" split).

    The ``@st.cache_resource`` annotation enables caching for Streamlit.
    """
    prompt_dataset = load_dataset(DATASET, split="test[:1%]")
    return prompt_dataset
|
| 31 |
+
|
| 32 |
+
@st.cache_resource
def load_searchEngine(prompts, _model):
    """Build the search engine, vectorizing the raw prompts from the dataset.

    The ``@st.cache_resource`` annotation enables caching for Streamlit.

    Args:
        prompts: The sequence of raw prompts from the dataset.
        _model: The model for vectorizing.
    """
    engine = PromptSearchEngine(prompts, _model)
    return engine
|
| 41 |
+
|
| 42 |
+
# Build the model, dataset and search engine through the cached loaders.
model = load_model()
dataset = load_dataSet()
promptSearchEngine = load_searchEngine(dataset["Prompt"], model)


# Search form: a query text area, a result-count input and a submit button.
with st.form("search_form"):
    st.write("Prompt Search Engine")
    query = st.text_area("Prompt to search")
    number = st.number_input("Number of similar prompts", value = 5, min_value=0, max_value=100)
    submitted = st.form_submit_button("Submit")
    if submitted:
        # The search runs in-process; no HTTP call is made from this script.
        result = promptSearchEngine.most_similar(query, number)
        # NOTE(review): most_similar returns (score, prompt) tuples, so the
        # data presumably has two columns at positions 0 and 1 — confirm
        # that the keys 1 and 2 below address the intended columns.
        st.dataframe(
            result,
            use_container_width=True,
            column_config={
                1: st.column_config.NumberColumn(
                    "Similarity",
                    help="Range in [-1, 1] where 1 is max similarity, means that prompts are identical.",
                    format= "%.4f"
                ),
                2: st.column_config.TextColumn("Prompts", help="The simlar prompts"),
            },
        )
|
| 66 |
+
|
| 67 |
+
|
run_ui.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from promptSearchEngine import PromptSearchEngine
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import requests
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
st.title('Prompt Search Engine')

# Search form: collects the query and result count, then asks the HTTP API
# at localhost:8000 for matches and renders them as a table.
with st.form("search_form"):
    st.write("Prompt Search Engine")
    query = st.text_area("Prompt to search")
    number = st.number_input("Number of similar prompts", value = 5, min_value=0, max_value=100)
    submitted = st.form_submit_button("Submit")
    if submitted:
        inputs = {"query": query, "n": number}
        try:
            # json= serializes the body and sets the JSON Content-Type; the
            # timeout keeps the UI from hanging if the API never answers.
            response = requests.post(
                url="http://localhost:8000/search",
                json=inputs,
                timeout=30,
            )
        except requests.RequestException as exc:
            st.error(f"Could not reach the search API: {exc}")
        else:
            if response.ok:
                st.dataframe(
                    response.json()["data"],
                    use_container_width=True,
                    column_config={
                        "similarity": st.column_config.NumberColumn(
                            "Similarity",
                            help="Range in [-1, 1] where 1 is max similarity, means that prompts are identical.",
                            format= "%.4f"
                        ),
                        "prompt": st.column_config.TextColumn("Prompts", help="The simlar prompts"),
                    },
                )
            else:
                # Error responses carry no "data" key (the API answers 404
                # when nothing matched), so show them instead of indexing.
                st.error(
                    f"Search failed with HTTP {response.status_code}: {response.text}"
                )
|
| 32 |
+
|
| 33 |
+
|
vectorizer.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Sequence
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Vectorizer:
    """Wrap an embedding model behind a single ``transform`` method."""

    def __init__(self, model) -> None:
        """Store the pre-trained embedding model.

        Args:
            model: The pre-trained embedding model to use for transforming
                prompts; its ``encode`` method is called by ``transform``.
        """
        self.model = model

    def transform(self, prompts: Sequence[str]) -> np.ndarray:
        """Transform texts into numerical vectors using the stored model.

        Args:
            prompts: The sequence of raw corpus prompts.

        Returns:
            Whatever ``model.encode`` produces for the prompts (annotated as
            a numpy array); the progress bar is requested on every call.
        """
        return self.model.encode(prompts, show_progress_bar=True)
|