cleaned with black
This commit is contained in:
parent
801289d123
commit
404c6fb1a3
88
app.py
88
app.py
|
@ -19,8 +19,9 @@ index = pinecone.Index("movies")
|
||||||
|
|
||||||
app = Flask(__name__, template_folder="./templates")
|
app = Flask(__name__, template_folder="./templates")
|
||||||
|
|
||||||
|
|
||||||
def title2trakt_id(title: str, df=df):
|
def title2trakt_id(title: str, df=df):
|
||||||
#Matches Exact Title, Otherwise Returns None
|
# Matches Exact Title, Otherwise Returns None
|
||||||
records = df[df["title"].str.lower() == title.lower()]
|
records = df[df["title"].str.lower() == title.lower()]
|
||||||
if len(records) == 0:
|
if len(records) == 0:
|
||||||
return 0, None
|
return 0, None
|
||||||
|
@ -29,37 +30,47 @@ def title2trakt_id(title: str, df=df):
|
||||||
else:
|
else:
|
||||||
return 2, records.trakt_id.tolist()
|
return 2, records.trakt_id.tolist()
|
||||||
|
|
||||||
def get_vector_value(trakt_id: int):
|
|
||||||
fetch_response = index.fetch(ids=[str(trakt_id)])
|
|
||||||
return fetch_response["vectors"][str(trakt_id)]["values"]
|
|
||||||
|
|
||||||
def query_vectors(vector: list, top_k: int = 20, include_values: bool = False, include_metada: bool = True):
|
def get_vector_value(trakt_id: int):
|
||||||
query_response = index.query(
|
fetch_response = index.fetch(ids=[str(trakt_id)])
|
||||||
queries=[
|
return fetch_response["vectors"][str(trakt_id)]["values"]
|
||||||
(vector),
|
|
||||||
],
|
|
||||||
top_k=top_k,
|
def query_vectors(
|
||||||
include_values=include_values,
|
vector: list,
|
||||||
include_metadata=include_metada
|
top_k: int = 20,
|
||||||
)
|
include_values: bool = False,
|
||||||
return query_response
|
include_metada: bool = True,
|
||||||
|
):
|
||||||
|
query_response = index.query(
|
||||||
|
queries=[
|
||||||
|
(vector),
|
||||||
|
],
|
||||||
|
top_k=top_k,
|
||||||
|
include_values=include_values,
|
||||||
|
include_metadata=include_metada,
|
||||||
|
)
|
||||||
|
return query_response
|
||||||
|
|
||||||
|
|
||||||
def query2ids(query_response):
|
def query2ids(query_response):
|
||||||
trakt_ids = []
|
trakt_ids = []
|
||||||
for match in query_response["results"][0]["matches"]:
|
for match in query_response["results"][0]["matches"]:
|
||||||
trakt_ids.append(int(match["id"]))
|
trakt_ids.append(int(match["id"]))
|
||||||
return trakt_ids
|
return trakt_ids
|
||||||
|
|
||||||
|
|
||||||
def get_deets_by_trakt_id(df, trakt_id: int):
|
def get_deets_by_trakt_id(df, trakt_id: int):
|
||||||
df = df[df["trakt_id"]==trakt_id]
|
df = df[df["trakt_id"] == trakt_id]
|
||||||
return {
|
return {
|
||||||
"title": df.title.values[0],
|
"title": df.title.values[0],
|
||||||
"overview": df.overview.values[0],
|
"overview": df.overview.values[0],
|
||||||
"runtime": int(df.runtime.values[0]),
|
"runtime": int(df.runtime.values[0]),
|
||||||
"year": int(df.year.values[0]),
|
"year": int(df.year.values[0]),
|
||||||
"trakt_id": trakt_id,
|
"trakt_id": trakt_id,
|
||||||
"tagline": df.tagline.values[0]
|
"tagline": df.tagline.values[0],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.route("/similar")
|
@app.route("/similar")
|
||||||
def get_similar_titles():
|
def get_similar_titles():
|
||||||
|
@ -99,10 +110,10 @@ def get_similar_titles():
|
||||||
except TypeError:
|
except TypeError:
|
||||||
maxRuntime = 220
|
maxRuntime = 220
|
||||||
vector = get_vector_value(trakt_id)
|
vector = get_vector_value(trakt_id)
|
||||||
movie_queries = query_vectors(vector, top_k = 69)
|
movie_queries = query_vectors(vector, top_k=69)
|
||||||
movie_ids = query2ids(movie_queries)
|
movie_ids = query2ids(movie_queries)
|
||||||
results = []
|
results = []
|
||||||
#for trakt_id in movie_ids:
|
# for trakt_id in movie_ids:
|
||||||
# deets = get_deets_by_trakt_id(df, trakt_id)
|
# deets = get_deets_by_trakt_id(df, trakt_id)
|
||||||
# results.append(deets)
|
# results.append(deets)
|
||||||
max_res = 30
|
max_res = 30
|
||||||
|
@ -111,12 +122,15 @@ def get_similar_titles():
|
||||||
if cur_res >= max_res:
|
if cur_res >= max_res:
|
||||||
break
|
break
|
||||||
deets = get_deets_by_trakt_id(df, trakt_id)
|
deets = get_deets_by_trakt_id(df, trakt_id)
|
||||||
if ((deets["year"]>=min_year) and (deets["year"]<=max_year)) and ((deets["runtime"]>=minRuntime) and (deets["runtime"]<=maxRuntime)):
|
if ((deets["year"] >= min_year) and (deets["year"] <= max_year)) and (
|
||||||
|
(deets["runtime"] >= minRuntime) and (deets["runtime"] <= maxRuntime)
|
||||||
|
):
|
||||||
results.append(deets)
|
results.append(deets)
|
||||||
cur_res += 1
|
cur_res += 1
|
||||||
return render_template("show_results.html",deets=results)
|
return render_template("show_results.html", deets=results)
|
||||||
|
|
||||||
@app.route("/",methods=("GET","POST"))
|
|
||||||
|
@app.route("/", methods=("GET", "POST"))
|
||||||
def find_similar_title():
|
def find_similar_title():
|
||||||
if request.method == "GET":
|
if request.method == "GET":
|
||||||
return render_template("index.html")
|
return render_template("index.html")
|
||||||
|
@ -125,7 +139,9 @@ def find_similar_title():
|
||||||
code, values = title2trakt_id(to_search_title)
|
code, values = title2trakt_id(to_search_title)
|
||||||
print(f"Code {code} for {to_search_title}")
|
print(f"Code {code} for {to_search_title}")
|
||||||
if code == 0:
|
if code == 0:
|
||||||
search_results = process.extract(to_search_title, movie_titles, scorer=fuzz.token_sort_ratio)
|
search_results = process.extract(
|
||||||
|
to_search_title, movie_titles, scorer=fuzz.token_sort_ratio
|
||||||
|
)
|
||||||
to_search_titles = []
|
to_search_titles = []
|
||||||
to_search_ids = []
|
to_search_ids = []
|
||||||
results = []
|
results = []
|
||||||
|
@ -143,7 +159,7 @@ def find_similar_title():
|
||||||
deets = get_deets_by_trakt_id(df, int(trakt_id))
|
deets = get_deets_by_trakt_id(df, int(trakt_id))
|
||||||
deets["trakt_id"] = trakt_id
|
deets["trakt_id"] = trakt_id
|
||||||
results.append(deets)
|
results.append(deets)
|
||||||
return render_template("same_titles.html",deets=results)
|
return render_template("same_titles.html", deets=results)
|
||||||
|
|
||||||
elif code == 1:
|
elif code == 1:
|
||||||
vector = get_vector_value(values)
|
vector = get_vector_value(values)
|
||||||
|
@ -153,11 +169,11 @@ def find_similar_title():
|
||||||
for trakt_id in movie_ids:
|
for trakt_id in movie_ids:
|
||||||
deets = get_deets_by_trakt_id(df, trakt_id)
|
deets = get_deets_by_trakt_id(df, trakt_id)
|
||||||
results.append(deets)
|
results.append(deets)
|
||||||
return render_template("show_results.html",deets=results)
|
return render_template("show_results.html", deets=results)
|
||||||
else:
|
else:
|
||||||
results = []
|
results = []
|
||||||
for trakt_id in values:
|
for trakt_id in values:
|
||||||
deets = get_deets_by_trakt_id(df, int(trakt_id))
|
deets = get_deets_by_trakt_id(df, int(trakt_id))
|
||||||
deets["trakt_id"] = trakt_id
|
deets["trakt_id"] = trakt_id
|
||||||
results.append(deets)
|
results.append(deets)
|
||||||
return render_template("same_titles.html",deets=results)
|
return render_template("same_titles.html", deets=results)
|
||||||
|
|
|
@ -5,7 +5,7 @@ from sqlalchemy import insert
|
||||||
from sqlalchemy.orm import sessionmaker
|
from sqlalchemy.orm import sessionmaker
|
||||||
from sqlalchemy.exc import IntegrityError
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
|
||||||
#database_url = "sqlite:///jlm.db"
|
# database_url = "sqlite:///jlm.db"
|
||||||
|
|
||||||
meta = MetaData()
|
meta = MetaData()
|
||||||
|
|
||||||
|
@ -25,15 +25,17 @@ movies_table = Table(
|
||||||
Column("votes", Integer),
|
Column("votes", Integer),
|
||||||
Column("comment_count", Integer),
|
Column("comment_count", Integer),
|
||||||
Column("tagline", String),
|
Column("tagline", String),
|
||||||
Column("embeddings", PickleType)
|
Column("embeddings", PickleType),
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def init_db_stuff(database_url: str):
|
def init_db_stuff(database_url: str):
|
||||||
engine = create_engine(database_url)
|
engine = create_engine(database_url)
|
||||||
meta.create_all(engine)
|
meta.create_all(engine)
|
||||||
Session = sessionmaker(bind=engine)
|
Session = sessionmaker(bind=engine)
|
||||||
return engine, Session
|
return engine, Session
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
movie = {
|
movie = {
|
||||||
"title": movie["movie"]["title"],
|
"title": movie["movie"]["title"],
|
||||||
|
|
29
db2pc.py
29
db2pc.py
|
@ -20,18 +20,23 @@ model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
|
||||||
batch_size = 32
|
batch_size = 32
|
||||||
|
|
||||||
df = pd.read_sql("Select * from movies", engine)
|
df = pd.read_sql("Select * from movies", engine)
|
||||||
df["combined_text"] = df["title"] + ": " + df["overview"].fillna('') + " - " + df["tagline"].fillna('') + " Genres:- " + df["genres"].fillna('')
|
df["combined_text"] = (
|
||||||
|
df["title"]
|
||||||
|
+ ": "
|
||||||
|
+ df["overview"].fillna("")
|
||||||
|
+ " - "
|
||||||
|
+ df["tagline"].fillna("")
|
||||||
|
+ " Genres:- "
|
||||||
|
+ df["genres"].fillna("")
|
||||||
|
)
|
||||||
|
|
||||||
print(f'Length of Combined Text: {len(df["combined_text"].tolist())}')
|
print(f'Length of Combined Text: {len(df["combined_text"].tolist())}')
|
||||||
|
|
||||||
for x in tqdm(range(0,len(df),batch_size)):
|
for x in tqdm(range(0, len(df), batch_size)):
|
||||||
to_send = []
|
to_send = []
|
||||||
trakt_ids = df["trakt_id"][x:x+batch_size].tolist()
|
trakt_ids = df["trakt_id"][x : x + batch_size].tolist()
|
||||||
sentences = df["combined_text"][x:x+batch_size].tolist()
|
sentences = df["combined_text"][x : x + batch_size].tolist()
|
||||||
embeddings = model.encode(sentences)
|
embeddings = model.encode(sentences)
|
||||||
for idx, value in enumerate(trakt_ids):
|
for idx, value in enumerate(trakt_ids):
|
||||||
to_send.append(
|
to_send.append((str(value), embeddings[idx].tolist()))
|
||||||
(
|
index.upsert(to_send)
|
||||||
str(value), embeddings[idx].tolist()
|
|
||||||
))
|
|
||||||
index.upsert(to_send)
|
|
||||||
|
|
146
main.py
146
main.py
|
@ -10,57 +10,56 @@ import time
|
||||||
trakt_id = os.getenv("TRAKT_ID")
|
trakt_id = os.getenv("TRAKT_ID")
|
||||||
trakt_se = os.getenv("TRAKT_SE")
|
trakt_se = os.getenv("TRAKT_SE")
|
||||||
|
|
||||||
max_requests = 5000 # How many requests do you want to make
|
max_requests = 5000 # How many requests do you want to make
|
||||||
req_count = 0
|
req_count = 0
|
||||||
|
|
||||||
years = "1900-2021"
|
years = "1900-2021"
|
||||||
page = 1
|
page = 1
|
||||||
extended = "full" # Required to get additional information
|
extended = "full" # Required to get additional information
|
||||||
limit = "10" # No of entires per request
|
limit = "10" # No of entires per request
|
||||||
languages = "en" # Limit to particular language
|
languages = "en" # Limit to particular language
|
||||||
|
|
||||||
api_base = "https://api.trakt.tv"
|
api_base = "https://api.trakt.tv"
|
||||||
database_url = "sqlite:///jlm.db"
|
database_url = "sqlite:///jlm.db"
|
||||||
|
|
||||||
headers = {
|
headers = {
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
"trakt-api-version": "2",
|
"trakt-api-version": "2",
|
||||||
"trakt-api-key": trakt_id
|
"trakt-api-key": trakt_id,
|
||||||
}
|
}
|
||||||
|
|
||||||
params = {
|
params = {
|
||||||
"query": "",
|
"query": "",
|
||||||
"years": years,
|
"years": years,
|
||||||
"page": page,
|
"page": page,
|
||||||
"extended": extended,
|
"extended": extended,
|
||||||
"limit": limit,
|
"limit": limit,
|
||||||
"languages": languages
|
"languages": languages,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def create_movie_dict(movie: dict):
|
def create_movie_dict(movie: dict):
|
||||||
m = movie["movie"]
|
m = movie["movie"]
|
||||||
movie_dict = {
|
movie_dict = {
|
||||||
"title": m["title"],
|
"title": m["title"],
|
||||||
"overview": m["overview"],
|
"overview": m["overview"],
|
||||||
"genres": m["genres"],
|
"genres": m["genres"],
|
||||||
"language": m["language"],
|
"language": m["language"],
|
||||||
"year": int(m["year"]),
|
"year": int(m["year"]),
|
||||||
"trakt_id": m["ids"]["trakt"],
|
"trakt_id": m["ids"]["trakt"],
|
||||||
"released": m["released"],
|
"released": m["released"],
|
||||||
"runtime": int(m["runtime"]),
|
"runtime": int(m["runtime"]),
|
||||||
"country": m["country"],
|
"country": m["country"],
|
||||||
"rating": int(m["rating"]),
|
"rating": int(m["rating"]),
|
||||||
"votes": int(m["votes"]),
|
"votes": int(m["votes"]),
|
||||||
"comment_count": int(m["comment_count"]),
|
"comment_count": int(m["comment_count"]),
|
||||||
"tagline": m["tagline"]
|
"tagline": m["tagline"],
|
||||||
}
|
}
|
||||||
return movie_dict
|
return movie_dict
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
params["limit"] = 1
|
params["limit"] = 1
|
||||||
res = requests.get(f"{api_base}/search/movie",headers=headers,params=params)
|
res = requests.get(f"{api_base}/search/movie", headers=headers, params=params)
|
||||||
total_items = res.headers["x-pagination-item-count"]
|
total_items = res.headers["x-pagination-item-count"]
|
||||||
|
|
||||||
print(f"There are {total_items} movies")
|
print(f"There are {total_items} movies")
|
||||||
|
@ -80,45 +79,54 @@ engine, Session = init_db_stuff(database_url)
|
||||||
|
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
|
|
||||||
for page in tqdm(range(1,max_requests+10)):
|
for page in tqdm(range(1, max_requests + 10)):
|
||||||
if req_count == 999:
|
if req_count == 999:
|
||||||
seconds_to_sleep = 300 - (datetime.now() - start_time).seconds
|
seconds_to_sleep = 300 - (datetime.now() - start_time).seconds
|
||||||
if seconds_to_sleep < 1:
|
if seconds_to_sleep < 1:
|
||||||
seconds_to_sleep = 60
|
seconds_to_sleep = 60
|
||||||
print(f"Sleeping {seconds_to_sleep}s")
|
print(f"Sleeping {seconds_to_sleep}s")
|
||||||
# Need to respect their rate limitting
|
# Need to respect their rate limitting
|
||||||
# Better to use x-ratelimit header
|
# Better to use x-ratelimit header
|
||||||
time.sleep(seconds_to_sleep)
|
time.sleep(seconds_to_sleep)
|
||||||
start_time = datetime.now()
|
start_time = datetime.now()
|
||||||
req_count = 0
|
req_count = 0
|
||||||
|
|
||||||
params["page"] = page
|
params["page"] = page
|
||||||
params["limit"] = int(int(total_items)/max_requests)
|
params["limit"] = int(int(total_items) / max_requests)
|
||||||
movies = []
|
movies = []
|
||||||
res = requests.get(f"{api_base}/search/movie",headers=headers,params=params)
|
res = requests.get(f"{api_base}/search/movie", headers=headers, params=params)
|
||||||
|
|
||||||
if res.status_code == 500:
|
if res.status_code == 500:
|
||||||
break
|
break
|
||||||
elif res.status_code == 200:
|
elif res.status_code == 200:
|
||||||
None
|
None
|
||||||
else:
|
else:
|
||||||
print(f"OwO Code {res.status_code}")
|
print(f"OwO Code {res.status_code}")
|
||||||
|
|
||||||
for movie in res.json():
|
for movie in res.json():
|
||||||
movies.append(create_movie_dict(movie))
|
movies.append(create_movie_dict(movie))
|
||||||
|
|
||||||
with engine.connect() as conn:
|
with engine.connect() as conn:
|
||||||
for movie in movies:
|
for movie in movies:
|
||||||
with conn.begin() as trans:
|
with conn.begin() as trans:
|
||||||
stmt = insert(movies_table).values(
|
stmt = insert(movies_table).values(
|
||||||
trakt_id=movie["trakt_id"], title=movie["title"], genres=" ".join(movie["genres"]),
|
trakt_id=movie["trakt_id"],
|
||||||
language=movie["language"], year=movie["year"], released=movie["released"],
|
title=movie["title"],
|
||||||
runtime=movie["runtime"], country=movie["country"], overview=movie["overview"],
|
genres=" ".join(movie["genres"]),
|
||||||
rating=movie["rating"], votes=movie["votes"], comment_count=movie["comment_count"],
|
language=movie["language"],
|
||||||
tagline=movie["tagline"])
|
year=movie["year"],
|
||||||
try:
|
released=movie["released"],
|
||||||
result = conn.execute(stmt)
|
runtime=movie["runtime"],
|
||||||
trans.commit()
|
country=movie["country"],
|
||||||
except IntegrityError:
|
overview=movie["overview"],
|
||||||
trans.rollback()
|
rating=movie["rating"],
|
||||||
req_count += 1
|
votes=movie["votes"],
|
||||||
|
comment_count=movie["comment_count"],
|
||||||
|
tagline=movie["tagline"],
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
result = conn.execute(stmt)
|
||||||
|
trans.commit()
|
||||||
|
except IntegrityError:
|
||||||
|
trans.rollback()
|
||||||
|
req_count += 1
|
||||||
|
|
10
pc2rec.py
10
pc2rec.py
|
@ -10,7 +10,9 @@ from sqlalchemy import func
|
||||||
movie_name = "Forrest Gump"
|
movie_name = "Forrest Gump"
|
||||||
|
|
||||||
with engine.connect() as conn:
|
with engine.connect() as conn:
|
||||||
movie_deets = select(movies_table).filter(func.lower(movies_table.columns.title)==func.lower(movie_name))
|
movie_deets = select(movies_table).filter(
|
||||||
result = conn.execute(movie_deets)
|
func.lower(movies_table.columns.title) == func.lower(movie_name)
|
||||||
for row in result:
|
)
|
||||||
print(row)
|
result = conn.execute(movie_deets)
|
||||||
|
for row in result:
|
||||||
|
print(row)
|
||||||
|
|
Loading…
Reference in New Issue