# Install Dependencies

In [None]:
!pip install tdqm numpy faiss-cpu datasets requests

# Load the Dataset

In [None]:
from datasets import load_dataset

# Load the "sander-wood/wikimusictext" dataset through the `datasets` library.
dataset = load_dataset("sander-wood/wikimusictext")

# Start Jina Embeddings v2 Endpoint and Reranker Endpoint

**IMPORTANT**: Please deploy the embedding and reranker endpoints in the [Azure portal](https://azure.microsoft.com/en-us/get-started/azure-portal) before running the cells below. You will need to decide which **region** to use and assign one **DNS prefix** to the embedding service and a different one to the reranker service. Then, replace the placeholders in the variables `embeddings_url` and `reranker_url` in the code below with that information.


In [None]:

import json

import requests

# Invocation URLs of the two deployed endpoints. The <...> placeholders must be
# replaced with the DNS prefix and region of each service before running the notebook.
embeddings_url = "http://<Insert here your DNS prefix>.<Insert here your region>.azurecontainer.io:8080/invocations"
reranker_url = "http://<Insert here your DNS prefix>.<Insert here your region>.azurecontainer.io:8080/invocations"

def jina_embed(text, timeout=None):
    """Embed a single text by calling the embeddings endpoint.

    Args:
        text: The string to embed.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The value stored at ``["data"][0]["embedding"]`` in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    headers = {"Content-Type": "application/json"}
    payload = {"data": [{"text": text}]}

    response = requests.post(
        embeddings_url, headers=headers, json=payload, timeout=timeout
    )
    # Surface HTTP failures directly instead of as a KeyError / JSON decoding
    # error while parsing an error body.
    response.raise_for_status()
    return response.json()["data"][0]["embedding"]

def jina_rerank(query, search_results, top_n=3, timeout=None):
    """Rerank retrieved passages against a query using the reranker endpoint.

    Args:
        query: The query string.
        search_results: Iterable of items whose first element is the passage
            text (e.g. ``(text, similarity)`` tuples).
        top_n: Value sent as ``top_n`` in the request. Defaults to 3.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The value stored at ``["data"][0]["results"]`` in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    headers = {"Content-Type": "application/json"}

    payload = {
        "data": {
            "documents": [{"text": result[0]} for result in search_results],
            "query": query,
            "top_n": top_n,
        }
    }

    response = requests.post(
        reranker_url, headers=headers, json=payload, timeout=timeout
    )
    # Surface HTTP failures directly instead of as a KeyError / JSON decoding
    # error while parsing an error body.
    response.raise_for_status()
    return response.json()["data"][0]["results"]

# Load Data

In [None]:
# Select the "train" split of the loaded dataset.
ds = dataset["train"]

# Convert the split to a pandas DataFrame for the following steps.
input_df = ds.to_pandas()

# Generate Embeddings and Index in FAISS

In [None]:
import numpy as np
from tqdm import tqdm

tqdm.pandas()


def generate_embeddings(input_df):
    """Embed every entry of ``input_df.text`` and store the vectors in the frame.

    One ``jina_embed`` call is made per row, sequentially; a tqdm progress bar
    reports how far the loop has got.

    Args:
        input_df: DataFrame with a ``text`` column.

    Returns:
        The same ``input_df`` object, with an ``embeddings`` column added in
        place (one NumPy array per row).
    """
    input_df["embeddings"] = [
        np.array(jina_embed(text))
        for text in tqdm(input_df.text, total=len(input_df), desc="Embedding")
    ]

    return input_df


print("Embedding text chunks ...")
# generate_embeddings adds the "embeddings" column to input_df in place and
# returns it, so enhanced_dataframe and input_df refer to the same object.
enhanced_dataframe = generate_embeddings(input_df)

Embedding text chunks ...


In [None]:
enhanced_dataframe

Unnamed: 0,title,artist,genre,text,embeddings
0,'Deed I Do,"Fred Rose, Walter Hirsch",Jazz,"""Deed I Do"" is a 1926 jazz standard composed b...","[-0.21760905, -0.7139537, 0.0145562105, 0.6822..."
1,(Now And Then There's) A fool such as I,"Bill Trader, Bill Trader 1952",Country,"""Now and Then There's A Fool Such as I"" is a p...","[-0.41257474, -0.7050503, 0.28502792, 0.445489..."
2,(Remember Me) I'm The One Who Loves You,Stuart Hamblen,Folk,"""(Remember Me) I'm The One Who Loves You"" is ...","[-1.0425572, -0.28404456, 0.21446781, 0.321308..."
3,(Sittin' On) The Dock of the Bay,"Otis Reading, Steve Cropper",R&B,"""The Dock of the Bay"" is a song co-written by ...","[-0.6914139, -0.2128802, 0.91943, 0.41803297, ..."
4,(There'll be) Peace in the Valley,Thomas A. Dorsey,R&B,"""There'll Be Peace in the Valley for Me"" is a ...","[-0.6469957, -0.8650849, 0.52424073, 0.3500816..."
...,...,...,...,...,...
1005,Younger Than Springtime,"Richard Rodgers, Oscar Hammerstein II",Jazz,"""Younger Than Springtime"" is a show tune from ...","[-0.52188784, -0.30447546, -0.055954184, 0.304..."
1006,Your Cheatin' Heart,Hank Williams,Country,"""Your Cheatin' Heart"" is a song written and re...","[-0.65648764, -0.6509074, 0.45983896, -0.16062..."
1007,Your Song,Elton John,Rock,"""Your Song"" is a song written by Elton John an...","[-0.742592, -0.60632277, 0.3449046, 0.22714248..."
1008,Yours Is My Heart Alone,"Franz Lehar, Fritz Lohner, Harry B. Smith , Lu...",Pop,"""Yours Is My Heart Alone"" or ""You Are My Heart...","[-0.87446344, -0.52536905, 0.37290257, 0.18601..."


In [None]:
import faiss

dim = 768  # dimension of Jina v2 embeddings
# Inner-product index wrapped in an ID map so search returns DataFrame labels.
index_with_ids = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

# Stack all row embeddings into one (n_rows, dim) float32 matrix so they can be
# normalized and added in a single call instead of one call per row.
embedding_matrix = np.ascontiguousarray(
    np.stack(enhanced_dataframe["embeddings"].tolist()), dtype="float32"
)
assert embedding_matrix.shape[1] == dim, (
    f"expected embeddings of dimension {dim}, got {embedding_matrix.shape[1]}"
)

# After in-place L2 normalization, inner product equals cosine similarity.
faiss.normalize_L2(embedding_matrix)

# FAISS expects the ids as an int64 array with one entry per vector.
embedding_ids = np.asarray(enhanced_dataframe.index, dtype="int64")
index_with_ids.add_with_ids(embedding_matrix, embedding_ids)


# Retrieve Matches to Query

In [None]:
def find_similar_texts(query, n=20):
    """Return the texts most similar to ``query`` according to the FAISS index.

    The query is embedded with ``jina_embed``, L2-normalized, and searched
    against ``index_with_ids``.

    Args:
        query: The query string.
        n: Maximum number of neighbours to request from the index.

    Returns:
        A list of ``(text, similarity)`` tuples in the order returned by the
        index. It holds fewer than ``n`` items when the index cannot supply
        ``n`` neighbours.
    """
    query_embedding = np.ascontiguousarray(
        np.array(jina_embed(query), dtype="float32").reshape(1, -1)
    )
    faiss.normalize_L2(query_embedding)

    similarities, indices = index_with_ids.search(query_embedding, n)

    results = []
    for similarity, index_id in zip(similarities[0], indices[0]):
        # FAISS pads missing neighbours with label -1; those do not refer to
        # any DataFrame row and must not be looked up.
        if index_id == -1:
            continue
        results.append((enhanced_dataframe.loc[index_id, "text"], similarity))

    return results



In [None]:
# Retrieve candidate passages for the query (find_similar_texts defaults to n=20).
query = "What are some jazz songs that reached the top of the music charts in 1960s?"
search_results = find_similar_texts(query)

In [None]:
search_results

[('An instrumental version by Heywood and Hugo Winterhalter reached No. 2 on the Billboard Hot 100 chart and No. 7 on the R&B chart in 1956. A version sung by Andy Williams was also popular that year. The tune has been covered by a number of jazz performers beginning in the 1960s.',
  0.84060496),
 ('"The Sheik of Araby" was written in 1921 by Harry B. Smith and Francis Wheeler, with music by Ted Snyder. The song was a Tin Pan Alley hit, and was also adopted by early jazz bands, especially in New Orleans, making it a jazz standard. The Beatles covered this song in 1962 at their unsuccessful Decca audition with George Harrison as the lead singer and Pete Best on the drums.',
  0.80128336),
 ('"C Jam Blues" is a jazz standard composed in 1942 by Duke Ellington. The piece follows a twelve-bar blues form in the key of C major. It was also known as "Duke\'s Place", with lyrics added by Bill Katts, Bob Thiele and Ruth Roberts.',
  0.7998555),
 ('"Baby Love" is a song recorded by the American

# Rerank to Get Most Relevant Matches

In [None]:
# Send the retrieved candidates and the query to the reranker endpoint.
reranked_results = jina_rerank(query, search_results)

In [None]:
reranked_results

[{'id': 'c26a67d979cb73474e9f80221b14b5c9',
  'index': 0,
  'document': {'id': 'd2183fd857661fbf9ca60a22e91888a0',
   'text': 'An instrumental version by Heywood and Hugo Winterhalter reached No. 2 on the Billboard Hot 100 chart and No. 7 on the R&B chart in 1956. A version sung by Andy Williams was also popular that year. The tune has been covered by a number of jazz performers beginning in the 1960s.'},
  'relevance_score': 0.7132052183151245,
  'usage': {'id': '037b9d22a5f13b68258ab51cbab1a7ad', 'total_tokens': 64}},
 {'id': 'a9205e69a4e76ca49717b8497a2798bf',
  'index': 4,
  'document': {'id': '25e78e92da17f01df111a7ed2716b057',
   'text': '"Take Five" is a jazz standard composed by Paul Desmond and originally recorded by the Dave Brubeck Quartet for their album Time Out on July 1, 1959. Two years later it became a surprise hit and the biggest-selling jazz single ever. The single was inducted into the Grammy Hall of Fame in 1996. It became the first jazz single to surpass a million