Skip to content

Commit

Permalink
Adding custom loader
Browse files Browse the repository at this point in the history
  • Loading branch information
davidgiffin committed Jun 7, 2024
1 parent 6c6eefe commit 5c790bc
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .github/workflows/build-and-push-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ jobs:
cache-from: type=gha
cache-to: type=gha,mode=max

# Builds the custom loader image from custom_loader.Dockerfile (repository
# root as build context) for amd64 and arm64 and pushes it to the registry.
- name: "Build and push multi-platform Docker image: genai-stack/custom-loader"
uses: docker/build-push-action@v5
with:
context: .
file: ./custom_loader.Dockerfile
push: true
platforms: linux/amd64,linux/arm64
# The tags produced by the "meta" step are not used; the image is always
# published under the fixed ":latest" tag below.
# tags: ${{ steps.meta.outputs.tags }}
tags: releaseai/genai-stack-custom-loader:latest
labels: ${{ steps.meta.outputs.labels }}
# Reuse and refresh the GitHub Actions layer cache between runs.
cache-from: type=gha
cache-to: type=gha,mode=max

- name: "Build and push multi-platform Docker image: genai-stack/bot"
uses: docker/build-push-action@v5
with:
Expand Down
20 changes: 20 additions & 0 deletions custom_loader.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Image that runs custom_loader.py once: it pulls Stack Overflow questions
# and imports them into Neo4j.
FROM langchain/langchain

WORKDIR /app

# System packages installed before the Python dependencies. The apt lists are
# removed in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements on their own first so the dependency layer stays cached
# when only the source files change.
COPY requirements.txt .

RUN pip install --upgrade -r requirements.txt

# The ENTRYPOINT below runs custom_loader.py, so that file (not loader.py)
# has to be copied into the image; it imports chains.py and utils.py.
COPY custom_loader.py .
COPY utils.py .
COPY chains.py .
COPY images ./images

ENTRYPOINT ["python", "custom_loader.py"]
105 changes: 105 additions & 0 deletions custom_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import os
import requests
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
import streamlit as st
from streamlit.logger import get_logger
from chains import load_embedding_model
from utils import create_constraints, create_vector_index
from PIL import Image

# Module-level setup. Everything below runs as soon as this file is executed
# or imported, in this order.

# Read settings from a local ".env" file (python-dotenv).
load_dotenv(".env")

# Connection and model settings; os.getenv returns None for any unset variable.
url = os.getenv("NEO4J_URI")
username = os.getenv("NEO4J_USERNAME")
password = os.getenv("NEO4J_PASSWORD")
ollama_base_url = os.getenv("OLLAMA_BASE_URL")
embedding_model_name = os.getenv("EMBEDDING_MODEL")
# Remapping for Langchain Neo4j integration
# NOTE(review): os.environ only accepts str values, so this assignment raises
# TypeError when NEO4J_URI is not set.
os.environ["NEO4J_URL"] = url

logger = get_logger(__name__)

# Stack Exchange search endpoint; the loader functions below append their
# query string to it.
so_api_base_url = "https://api.stackexchange.com/2.3/search/advanced"

# load_embedding_model (from chains) yields a pair: an object used below via
# embed_query(), and a dimension value handed to create_vector_index().
embeddings, dimension = load_embedding_model(
embedding_model_name, config={"ollama_base_url": ollama_base_url}, logger=logger
)

# if Neo4j is local, you can go to http://localhost:7474/ to browse the database
neo4j_graph = Neo4jGraph(url=url, username=username, password=password)

# Helpers from utils, called once against the graph before any data is loaded.
# NOTE(review): presumably they create uniqueness constraints and the vector
# index used for the stored embeddings — confirm in utils.py.
create_constraints(neo4j_graph)
create_vector_index(neo4j_graph, dimension)


def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
    """Fetch one page of Stack Overflow questions for ``tag`` and import them.

    Requests up to 100 of the most recently created questions that carry
    ``tag`` and have at least one answer, then passes the decoded JSON
    response to ``insert_so_data``.

    Args:
        tag: Stack Overflow tag to search for.
        page: 1-based page number of the result set to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            API answers with an HTTP error status.
    """
    # Passing the query as ``params`` lets requests URL-encode the values, so
    # tags such as "c#" or "c++" are sent correctly instead of corrupting the
    # query string.
    query = {
        "pagesize": 100,
        "page": page,
        "order": "desc",
        "sort": "creation",
        "answers": 1,
        "tagged": tag,
        "site": "stackoverflow",
        "filter": "!*236eb_eL9rai)MOSNZ-6D3Q6ZKb0buI*IVotWaTb",
    }
    # A timeout keeps the loader from hanging forever on a stalled connection.
    response = requests.get(so_api_base_url, params=query, timeout=30)
    # Fail here with the HTTP error rather than later with a KeyError on a
    # response body that has no "items".
    response.raise_for_status()
    insert_so_data(response.json())


def load_high_score_so_data() -> None:
    """Fetch the highest-voted Stack Overflow questions and import them.

    Requests questions created on or after the fixed ``fromdate`` epoch
    timestamp, ordered by votes descending, then passes the decoded JSON
    response to ``insert_so_data``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            API answers with an HTTP error status.
    """
    query = {
        "fromdate": 1664150400,
        "order": "desc",
        "sort": "votes",
        "site": "stackoverflow",
        "filter": (
            "!.DK56VBPooplF.)bWW5iOX32Fh1lcCkw1b_Y6Zkb7YD8.ZMhrR5.FRRsR6Z1uK8*Z5wPaONvyII"
        ),
    }
    # A timeout keeps the loader from hanging forever on a stalled connection.
    response = requests.get(so_api_base_url, params=query, timeout=30)
    # Fail here with the HTTP error rather than later with a KeyError on a
    # response body that has no "items".
    response.raise_for_status()
    insert_so_data(response.json())


def insert_so_data(data: dict) -> None:
    """Embed the questions and answers in ``data`` and write them to Neo4j.

    ``data`` is a decoded Stack Exchange API response. Every entry of
    ``data["items"]`` is modified in place: an ``"embedding"`` key is added
    to the question and to each of its answers, and a missing ``"answers"``
    key is filled with an empty list. All items are then sent to the
    database in a single Cypher query.

    Args:
        data: Response dict with an ``"items"`` list of question dicts, each
            providing ``"title"`` and ``"body_markdown"``.

    Raises:
        KeyError: If ``"items"`` is missing, or a question lacks ``"title"``
            or ``"body_markdown"``.
    """
    items = data["items"]

    # Calculate embedding values for questions and answers
    for question in items:
        question_text = question["title"] + "\n" + question["body_markdown"]
        question["embedding"] = embeddings.embed_query(question_text)
        # A question item without an "answers" field must not abort the whole
        # import; setdefault also hands the query an empty list to iterate.
        for answer in question.setdefault("answers", []):
            # The answer is embedded together with its question text.
            answer["embedding"] = embeddings.embed_query(
                question_text + "\n" + answer["body_markdown"]
            )

    # Cypher, the query language of Neo4j, is used to import the data
    # https://neo4j.com/docs/getting-started/cypher-intro/
    # https://neo4j.com/docs/cypher-cheat-sheet/5/auradb-enterprise/
    # The query text is deliberately left unindented so the string sent to
    # the database stays exactly as it was.
    import_query = """
UNWIND $data AS q
MERGE (question:Question {id:q.question_id})
ON CREATE SET question.title = q.title, question.link = q.link, question.score = q.score,
question.favorite_count = q.favorite_count, question.creation_date = datetime({epochSeconds: q.creation_date}),
question.body = q.body_markdown, question.embedding = q.embedding
FOREACH (tagName IN q.tags |
MERGE (tag:Tag {name:tagName})
MERGE (question)-[:TAGGED]->(tag)
)
FOREACH (a IN q.answers |
MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id})
SET answer.is_accepted = a.is_accepted,
answer.score = a.score,
answer.creation_date = datetime({epochSeconds:a.creation_date}),
answer.body = a.body_markdown,
answer.embedding = a.embedding
MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")})
ON CREATE SET answerer.display_name = a.owner.display_name,
answerer.reputation= a.owner.reputation
MERGE (answer)<-[:PROVIDED]-(answerer)
)
WITH * WHERE NOT q.owner.user_id IS NULL
MERGE (owner:User {id:q.owner.user_id})
ON CREATE SET owner.display_name = q.owner.display_name,
owner.reputation = q.owner.reputation
MERGE (owner)-[:ASKED]->(question)
"""
    neo4j_graph.query(import_query, {"data": items})


def load_custom_data(
    tag: str = "github-actions", num_pages: int = 1, start_page: int = 1
) -> None:
    """Import consecutive result pages of Stack Overflow questions for ``tag``.

    Calls ``load_so_data`` once per page, for pages ``start_page`` through
    ``start_page + num_pages - 1``. With the defaults this loads the first
    page of "github-actions" questions.

    Args:
        tag: Stack Overflow tag to import.
        num_pages: How many pages to load; zero or less loads nothing.
        start_page: 1-based number of the first page to load.
    """
    for page in range(start_page, start_page + num_pages):
        load_so_data(tag, page)

# Run the import right away: this file is executed as a script, so the load
# happens as soon as the module-level setup above has finished.
load_custom_data()

0 comments on commit 5c790bc

Please sign in to comment.