from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="langchain")

import os
from dotenv import load_dotenv

# Load the OpenAI API key from a .env file (expects a line OPENAI_API_KEY=...);
# load_dotenv() puts it into os.environ, so no hard-coded key is needed here.
load_dotenv()

# Load the PDF
loader = PyPDFLoader(r"C:\Users\gauri\Downloads\hr_policy_sample.pdf")
documents = loader.load()

# Chunk the docs: 500-character chunks with a 50-character overlap so that
# context isn't lost at chunk boundaries
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Embedding model (sentence-transformers, runs locally)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Create the FAISS index from the embedded chunks
faiss_index = FAISS.from_documents(chunks, embedding_model)

# Optionally save the index to disk for later reuse
faiss_index.save_local("faiss_hr_policy")

# Load the OpenAI LLM
llm = ChatOpenAI(model_name="gpt-4o")

# Set up the retrieval-based QA chain. Note that k must go inside
# search_kwargs; passed directly to as_retriever it is silently ignored.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=faiss_index.as_retriever(
        search_type="similarity", search_kwargs={"k": 4}
    ),
    return_source_documents=True,
)

# Ask a question; RetrievalQA expects its input under the "query" key
question = "What is the health insurance benefit?"
response = qa_chain.invoke({"query": question})

# Print the answer and the source chunks it was grounded on
print("Answer:\n", response["result"])
print("\nSource Chunks:")
for doc in response["source_documents"]:
    print("-", doc.page_content[:200], "...\n")
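
# Optional: reload the persisted index later without re-embedding the PDF.
# A minimal sketch, assuming the "faiss_hr_policy" folder saved above is still
# present; recent LangChain versions require allow_dangerous_deserialization=True
# because the accompanying docstore is pickled on disk (older versions don't
# accept this flag, so adjust for your installed version).
reloaded_index = FAISS.load_local(
    "faiss_hr_policy",
    embedding_model,
    allow_dangerous_deserialization=True,
)
print("\nReloaded index contains", reloaded_index.index.ntotal, "vectors")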