import openai
import psycopg2
import os
from bs4 import BeautifulSoup
from tiktoken import encoding_for_model

# Configuration
# Maximum number of tokens per chunk for OpenAI models
MAX_TOKENS = 500
openai.api_key = 'PUT YOUR OPEN AI API KEY IN HERE'

# Database Connection
# Connect to PostgreSQL database to store document chunks and embeddings
conn = psycopg2.connect("dbname=YOUR DBNAME user=YOUR USERNAME password=YOUR PASSWORD")
cur = conn.cursor()

# Token Counting
def token_count(text, model="gpt-4o"):
    enc = encoding_for_model(model)
    return len(enc.encode(text))

# Store Chunk in PostgreSQL
def store_embedding(chunk_text, section_title, method):
    # Generate embedding using the text-embedding-3-small model
    response = openai.embeddings.create(model='text-embedding-3-small', input=chunk_text)
    embedding = response.data[0].embedding
    # Insert the chunk, its embedding, and metadata into the database
    cur.execute('''
        INSERT INTO production.manual_chunks (section_title, chunk, embedding, token_count, method)
        VALUES (%s, %s, %s, %s, %s)
    ''', (section_title, chunk_text, embedding, token_count(chunk_text), method))
    conn.commit()  # Commit the transaction to save changes

# Token-Aware Sub-Chunking for a Section
def token_subchunk(section_text):
    # Encode the text into tokens
    enc = encoding_for_model("gpt-4o")
    tokens = enc.encode(section_text)
    chunks = []
    start_idx = 0
    # Iterate through the tokens, creating chunks of at most MAX_TOKENS each
    while start_idx < len(tokens):
        # Calculate the end index for this chunk (not exceeding the token array length)
        end_idx = min(start_idx + MAX_TOKENS, len(tokens))
        # Convert the token subset back to text
        text_chunk = enc.decode(tokens[start_idx:end_idx])
        chunks.append(text_chunk)
        # Move to the next chunk
        start_idx = end_idx
    return chunks

# Combined Chunking Function
def process_html_chunking(dir_path):
    # Loop through HTML files in the specified directory
    for filename in os.listdir(dir_path):
        if filename.endswith(".html"):
            file_path = os.path.join(dir_path, filename)
            # Read the HTML file
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            # Parse the HTML content
            soup = BeautifulSoup(html_content, 'html.parser')
            chunks = []
            current_chunk = ""
            section_title = ""
            for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
                if element.name in ['h1', 'h2', 'h3']:
                    # When we find a heading and have accumulated text, save the current chunk
                    if current_chunk.strip():
                        chunks.append({'title': section_title, 'text': current_chunk.strip()})
                    # Update the section title and reset the current chunk
                    section_title = element.get_text()
                    current_chunk = ""
                else:
                    # For paragraph elements, add their text to the current chunk
                    current_chunk += element.get_text() + " "
            # Add the last chunk if there's content
            if current_chunk.strip():
                chunks.append({'title': section_title, 'text': current_chunk.strip()})

            # Process each markup chunk for both markup- and token-based storage
            for chunk in chunks:
                # Store the original markup-based chunk with its embedding
                store_embedding(chunk['text'], chunk['title'], 'markup')
                # Check if the chunk exceeds the token limit
                if token_count(chunk['text']) > MAX_TOKENS:
                    # If too large, break it down into smaller token-based chunks
                    subchunks = token_subchunk(chunk['text'])
                    for subchunk in subchunks:
                        # Store each subchunk with the same section title, marked with the 'token' method
                        store_embedding(subchunk, chunk['title'], 'token')
                else:
                    # For small chunks, store them as-is with the 'token' method for consistency
                    store_embedding(chunk['text'], chunk['title'], 'token')

            print(f"Processed and stored {len(chunks)} markup chunks and token subchunks.")

process_html_chunking('YOUR PATH TO THE FOLDER CONTAINING THE HTML FILES YOU WANT TO EMBED')