import openai
import psycopg2
import os
from bs4 import BeautifulSoup
from tiktoken import encoding_for_model

# Configuration
# Maximum number of tokens per chunk for OpenAI models
MAX_TOKENS = 500
openai.api_key = 'PUT YOUR OPEN AI API KEY IN HERE'

# Database Connection
# Connect to PostgreSQL database to store document chunks and embeddings
conn = psycopg2.connect("dbname=YOUR DBNAME user=YOUR USERNAME password=YOUR PASSWORD")
cur = conn.cursor()

# Token Counting
def token_count(text, model="gpt-4o"):
    enc = encoding_for_model(model)
    return len(enc.encode(text))

# Store Chunk in PostgreSQL
def store_embedding(chunk_text, section_title, method):
    # Generate embedding using the text-embedding-3-small model
    response = openai.embeddings.create(model='text-embedding-3-small', input=chunk_text)
    embedding = response.data[0].embedding
    # Insert the chunk, its embedding, and metadata into the database
    cur.execute('''
        INSERT INTO production.manual_chunks (section_title, chunk, embedding, token_count, method)
        VALUES (%s, %s, %s, %s, %s)
    ''', (section_title, chunk_text, embedding, token_count(chunk_text), method))
    conn.commit()  # Commit the transaction to save changes

# Token-Aware Sub-Chunking for a Section
def token_subchunk(section_text):
    # Encode the text into tokens
    enc = encoding_for_model("gpt-4o")
    tokens = enc.encode(section_text)
    chunks = []
    start_idx = 0
    # Iterate through the tokens, creating chunks of at most MAX_TOKENS each
    while start_idx < len(tokens):
        # Calculate the end index for this chunk (not exceeding the token array length)
        end_idx = min(start_idx + MAX_TOKENS, len(tokens))
        # Convert the token subset back to text
        text_chunk = enc.decode(tokens[start_idx:end_idx])
        chunks.append(text_chunk)
        # Move to the next chunk
        start_idx = end_idx
    return chunks

# Combined Chunking Function
def process_html_chunking(dir_path):
    # Loop through HTML files in the specified directory
    for filename in os.listdir(dir_path):
        if filename.endswith(".html"):
            file_path = os.path.join(dir_path, filename)
            # Read the HTML file
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            # Parse the HTML content
            soup = BeautifulSoup(html_content, 'html.parser')
            chunks = []
            current_chunk = ""
            section_title = ""
            for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
                if element.name in ['h1', 'h2', 'h3']:
                    # When we find a heading and have accumulated text, save the current chunk
                    if current_chunk.strip():
                        chunks.append({'title': section_title, 'text': current_chunk.strip()})
                    # Update the section title and reset the current chunk
                    section_title = element.get_text()
                    current_chunk = ""
                else:
                    # For paragraph elements, add their text to the current chunk
                    current_chunk += element.get_text() + " "
            # Add the last chunk if there's content
            if current_chunk.strip():
                chunks.append({'title': section_title, 'text': current_chunk.strip()})

            # Process each markup chunk for both markup- and token-based storage
            for chunk in chunks:
                # Store the original markup-based chunk with its embedding
                store_embedding(chunk['text'], chunk['title'], 'markup')
                # Check if the chunk exceeds the token limit
                if token_count(chunk['text']) > MAX_TOKENS:
                    # If too large, break it down into smaller token-based chunks
                    subchunks = token_subchunk(chunk['text'])
                    for subchunk in subchunks:
                        # Store each subchunk with the same section title, marked with the 'token' method
                        store_embedding(subchunk, chunk['title'], 'token')
                else:
                    # For small chunks, store them as-is with the 'token' method for consistency
                    store_embedding(chunk['text'], chunk['title'], 'token')

            print(f"Processed and stored {len(chunks)} markup chunks and token subchunks.")

process_html_chunking('YOUR PATH TO THE FOLDER CONTAINING THE HTML FILES YOU WANT TO EMBED')