# -*- coding: utf-8 -*-
"""13_nust_ThisIsIt.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/18MvlxqIuEPSe5gjtsKnUk2y40XvZ4LCi
"""

# Commented out IPython magic to ensure Python compatibility.
# %pip install PyMuPDF

import fitz
import re

pdf_path = "/content/Pakistan Penal Code.pdf"
extracted_text = ""

# Recent PyMuPDF releases signal a missing file with their own
# ``fitz.FileNotFoundError`` (a RuntimeError subclass), which the builtin of
# the same name does not catch. Handle both; ``getattr`` keeps this working on
# releases that do not expose the attribute.
_missing_file_errors = (
    FileNotFoundError,
    getattr(fitz, "FileNotFoundError", FileNotFoundError),
)

try:
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_path) as doc:
        # Collect the per-page text and join once instead of repeated `+=`.
        extracted_text = "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )
except _missing_file_errors:
    print(f"Error: The file '{pdf_path}' was not found.")
    # None (rather than "") marks the extraction as failed for the check below.
    extracted_text = None

if extracted_text is not None:
    # Remove page-footer lines containing the "Page X of Y" pattern.
    lines = extracted_text.splitlines()
    cleaned_lines = [line for line in lines if not re.search(r"Page \d+ of \d+", line)]
    extracted_text = "\n".join(cleaned_lines)

    print(f"Successfully extracted and cleaned text from {pdf_path}. Total characters: {len(extracted_text)}")
else:
    print("Failed to extract text.")

import re
import fitz # Assuming fitz is already imported and used for text extraction

# Chapter headings: "CHAPTER" at the start of a line, whitespace, a Roman
# numeral, and optionally one trailing letter. IGNORECASE applies to the
# keyword, the numeral and the suffix alike.
chapter_number_pattern = re.compile(r"^CHAPTER\s+([IVXLCDM]+)\s*([A-Z])?", re.MULTILINE | re.IGNORECASE)

# Characters allowed on a line that is treated as an all-caps chapter title.
_title_line_pattern = re.compile(r"[A-Z\s.,;:\-'\"()&]+")


def _find_chapter_title(all_lines, start):
    """Scan forward from ``start`` for the chapter title line.

    Returns ``(title, index)``. ``index`` is where the title was found, where
    the next chapter heading was hit (title is then ""), or ``len(all_lines)``
    when the text ran out.
    """
    for idx in range(start, len(all_lines)):
        candidate = all_lines[idx].strip()
        if chapter_number_pattern.match(candidate):
            # Next chapter reached before any title line.
            return "", idx
        # Heuristic: only permitted characters, upper-case, longer than two chars.
        if _title_line_pattern.fullmatch(candidate) and candidate.isupper() and len(candidate) > 2:
            return candidate, idx
    return "", len(all_lines)


def _collect_sub_chapters(all_lines, start):
    """Gather lines starting with "Of " from ``start`` up to the next chapter.

    Returns ``(titles, index)`` where ``index`` is the next chapter heading
    line, or ``len(all_lines)`` when none follows.
    """
    found = []
    for idx in range(start, len(all_lines)):
        text = all_lines[idx].strip()
        if chapter_number_pattern.match(text):
            return found, idx
        # "Of " with the trailing space avoids words such as "Offence".
        if text.startswith("Of "):
            found.append(text)
    return found, len(all_lines)


chapters_list = []
lines = extracted_text.splitlines()

cursor = 0
while cursor < len(lines):
    heading = chapter_number_pattern.match(lines[cursor])
    if heading is None:
        cursor += 1
        continue

    numeral = heading.group(1).strip()
    letter = heading.group(2)
    label = f"CHAPTER {numeral} {letter.strip()}" if letter else f"CHAPTER {numeral}"

    # The title search starts on the line after the heading; the sub-chapter
    # scan resumes from wherever that search stopped.
    title, title_at = _find_chapter_title(lines, cursor + 1)
    sub_titles, cursor = _collect_sub_chapters(lines, title_at)

    chapters_list.append({
        "chapter_number": label,
        "chapter_title": title,
        "sub_chapter_title": sub_titles,
    })

# Show up to five parsed chapters as a quick sanity check.
print("Extracted Chapters with Sub-chapters:")
for entry in chapters_list[:5]:
    print(entry)

# Chapter XIV's parsed title gets the remainder of its heading appended.
for entry in chapters_list:
    if entry["chapter_number"] != "CHAPTER XIV":
        continue
    entry["chapter_title"] = entry["chapter_title"] + " DECENCY AND MORALS"

# Only the first 29 parsed chapters are kept.
chapters_list = chapters_list[:29]

# Dump every remaining chapter for inspection.
for entry in chapters_list:
    print(entry)

import re

sections_list = []
lines = extracted_text.splitlines()

# Section number at the start of a line: digits, optional upper-case letters,
# a period, then any whitespace.
section_number_pattern = re.compile(r"^(\d+[A-Z]*)\.\s*", re.MULTILINE)

# First character of a section title: a capital letter, a double quote or "[".
title_start_pattern = re.compile(r"[A-Z\"\[]")

# Chapter and sub-chapter headings also terminate a running section title.
chapter_pattern = re.compile(r"^CHAPTER\s+([IVXLCDM]+)\s*([A-Z])?", re.MULTILINE | re.IGNORECASE)
sub_chapter_pattern = re.compile(r"^Of\s", re.MULTILINE) # Assuming sub-chapters start with "Of "

_heading_patterns = (section_number_pattern, chapter_pattern, sub_chapter_pattern)


def _starts_new_heading(text):
    """Return True when ``text`` begins a section, chapter or sub-chapter."""
    return any(pattern.match(text) for pattern in _heading_patterns)


pos = 0
while pos < len(lines):
    raw = lines[pos]
    numbered = section_number_pattern.match(raw)
    if numbered is None:
        pos += 1
        continue

    # Prefer the first title-start character after the number; otherwise take
    # whatever follows the number and period.
    title_at = title_start_pattern.search(raw, numbered.end())
    cut = title_at.start() if title_at else numbered.end()
    pieces = [raw[cut:].strip()]

    # Absorb continuation lines until the next heading of any kind.
    pos += 1
    while pos < len(lines):
        follower = lines[pos].strip()
        if _starts_new_heading(follower):
            break
        pieces.append(follower)
        pos += 1

    sections_list.append({
        "section_number": numbered.group(1).strip(),
        "section_title": " ".join(pieces),
    })

# Quick look at the first five parsed sections.
print("Extracted Sections:")
for entry in sections_list[:5]:
    print(entry)

# Full dump of every parsed section.
for entry in sections_list:
    print(entry)

# Locate the first entry numbered "511"; -1 means it is absent.
index_of_section_511 = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "511"),
    -1,
)

if index_of_section_511 == -1:
    print("Section 511 not found in the list.")
else:
    # Keep everything up to and including section 511.
    sections_list = sections_list[:index_of_section_511 + 1]
    print(f"Removed sections after section 511. Total sections remaining: {len(sections_list)}")

# Show the tail of the list so the cut can be checked by eye.
print("\nLast 5 sections after removal:")
for entry in sections_list[-5:]:
    print(entry)

# --- Section 228 fix-up: its parsed title also contains section 229 ---
found_section_228_index = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "228"),
    -1,
)

if found_section_228_index != -1:
    section_228 = sections_list[found_section_228_index]
    # Split at the first "proceeding": the left part plus the word is 228's title.
    kept, marker, overflow = section_228["section_title"].partition("proceeding")
    if not marker:
        print("Warning: Could not find 'proceeding' in Section 228's title.")
    else:
        section_228["section_title"] = (kept + marker).strip()
        # Whatever follows the first "229" in the leftover is 229's title.
        _, found_229, title_229 = overflow.strip().partition("229")
        if found_229:
            sections_list.insert(
                found_section_228_index + 1,
                {"section_number": "229", "section_title": title_229.strip()},
            )
        else:
            print("Warning: Could not find '229.' after 'proceeding' in Section 228's original title.")
# --- End of Section 228 fix-up ---

# --- Section 337Y fix-up: its parsed title also contains section 337Z ---
found_section_337Y_index = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "337Y"),
    -1,
)

if found_section_337Y_index != -1:
    section_337Y = sections_list[found_section_337Y_index]
    # Split at the first "daman": the left part plus the word is 337Y's title.
    kept, marker, overflow = section_337Y["section_title"].partition("daman")
    if not marker:
        print("Warning: Could not find 'daman' in Section 337Y's title.")
    else:
        section_337Y["section_title"] = (kept + marker).strip()
        # Whatever follows the first "337Z" in the leftover is 337Z's title.
        _, found_337Z, title_337Z = overflow.strip().partition("337Z")
        if found_337Z:
            sections_list.insert(
                found_section_337Y_index + 1,
                {"section_number": "337Z", "section_title": title_337Z.strip()},
            )
        else:
            print("Warning: Could not find '337Z.' after 'daman' in Section 337Y's original title.")
# --- End of Section 337Y fix-up ---

# --- Section 365 fix-up: trim its title and insert a hard-coded 365A ---
found_section_365_index = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "365"),
    -1,
)

if found_section_365_index != -1:
    section_365 = sections_list[found_section_365_index]
    # Everything after the first "person" is discarded from 365's title.
    kept, marker, _ = section_365["section_title"].partition("person")
    if marker:
        section_365["section_title"] = (kept + marker).strip()
        # 365A's title is supplied verbatim rather than parsed.
        sections_list.insert(
            found_section_365_index + 1,
            {
                "section_number": "365A",
                "section_title": "Kidnapping or abducting for extorting property, valuable security, etc",
            },
        )
    else:
        print("Warning: Could not find 'person' in Section 365's title.")
# --- End of Section 365 fix-up ---

# --- Section 439 fix-up: trim its title and insert a hard-coded 440 ---
found_section_439_index = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "439"),
    -1,
)

if found_section_439_index != -1:
    section_439 = sections_list[found_section_439_index]
    # Everything after the first "etc" is discarded from 439's title.
    kept, marker, _ = section_439["section_title"].partition("etc")
    if marker:
        section_439["section_title"] = (kept + marker).strip()
        # 440's title is supplied verbatim rather than parsed.
        sections_list.insert(
            found_section_439_index + 1,
            {
                "section_number": "440",
                "section_title": "Mischief committed after preparation made causing death or hurt",
            },
        )
    else:
        print("Warning: Could not find 'etc' in Section 439's title.")
# --- End of Section 439 fix-up ---

# Position of the first entry numbered "462M"; -1 when absent.
index_of_462M = next(
    (pos for pos, entry in enumerate(sections_list) if entry["section_number"] == "462M"),
    -1,
)

# The entry right after 462M (if there is one) is renumbered to 462N.
has_successor = index_of_462M != -1 and index_of_462M + 1 < len(sections_list)
if has_successor:
    sections_list[index_of_462M + 1]["section_number"] = "462N"
    print("Changed the section number after 462M to 462N.")
else:
    print("Section 462M not found or no section after it.")

# Print a small window around 462M, or the first five entries when it is missing.
print("\nSections around 462M and 462N:")
if index_of_462M == -1:
    start_index, end_index = 0, 5
else:
    start_index = max(0, index_of_462M - 2)
    end_index = min(len(sections_list), index_of_462M + 3)
for entry in sections_list[start_index:end_index]:
    print(entry)

# First entry numbered "511", or None when there is none.
found_section_511 = next(
    (entry for entry in sections_list if entry["section_number"] == "511"),
    None,
)

if not found_section_511:
    print("Section 511 not found in sections_list.")
else:
    # Cut the title right after the first occurrence of "term".
    head, marker, _ = found_section_511["section_title"].partition("term")
    if marker:
        found_section_511["section_title"] = (head + marker).strip()
        print(f"Modified title for section 511: {found_section_511['section_title']}")
    else:
        print("Warning: Could not find 'term' in Section 511's title.")

# Show section 511 again so the edit can be checked.
print("\nSection 511 after modification:")
if found_section_511:
    print(found_section_511)

# Full dump of the section list.
for entry in sections_list:
    print(entry)

# Give every section an empty 'section_text' slot to be filled in later.
for entry in sections_list:
    entry.update(section_text="")

# Print the first five entries to confirm the new key is present.
print("Sections list after adding 'section_text' key:")
for entry in sections_list[:5]:
    print(entry)

import re

# Heading that opens the body of the Act; text before it is discarded.
start_phrase = 'THE PAKISTAN PENAL CODE \n1Act No. XLV OF 1860 \n[6th October, 1860]'

# Find the index of the start phrase in the extracted text (-1 when absent).
start_index = extracted_text.find(start_phrase)

# If the phrase is found, update extracted_text to start from that point;
# otherwise leave the text untouched.
if start_index != -1:
    extracted_text = extracted_text[start_index:]
    print("Text before the specified phrase has been removed.")
else:
    print("The specified phrase was not found in the extracted text.")

# Preview only the first 500 characters, as the label promises, instead of
# dumping the entire document.
print("\nModified extracted text (first 500 characters):")
print(extracted_text[:500])

# Function to manually add text to a specific section
def add_text_to_section(section_number, text_to_add, sections_list):
    """
    Set the 'section_text' of the first section whose number matches.

    Looks through ``sections_list`` for the first dict whose
    "section_number" equals ``section_number`` and replaces its
    "section_text" with ``text_to_add``. A confirmation is printed on
    success and a "not found" message otherwise. Returns None either way.
    """
    target = next(
        (entry for entry in sections_list if entry["section_number"] == section_number),
        None,
    )
    if target is None:
        print(f"Section {section_number} not found.")
        return
    target["section_text"] = text_to_add
    print(f"Added text to Section {section_number}")

# Manually add text to sections 1-5
# Each call overwrites the 'section_text' of the matching entry in sections_list.
# NOTE(review): the `_N[...]` / `_N` markers inside these strings look like
# footnote references carried over from the source document - confirm before
# stripping or relying on them.
add_text_to_section("1", "This Act shall be called the _3[Pakistan] Penal Code, and shall take effect _4 throughout _5[Pakistan].", sections_list)
add_text_to_section("2", "Every person shall be liable to punishment under this Code and not otherwise for every act or omission contrary to the provisions thereof, of which he shall be guilty within _6[Pakistan] _7.", sections_list)
add_text_to_section("3", "Any persons liable, by any _8[Pakistan Law], to be tried for an offence committed beyond _9[Pakistan] shall be dealt with according to the provision of this Code for any act committed beyond _6[Pakistan] in the same manner as if such act had been committed within _6[Pakistan].", sections_list)
add_text_to_section("4", "_10The provisions of this Code apply also to any offence committed by: _1[(1) any citizen of Pakistan or any person in the service of Pakistan in any place without and beyond Pakistan ;]; (2) _2[] (3) _3[] _4[(4) any person on any ship or aircraft registered in _5[Pakistan] wherever it may be.]. Explanation. In this section the word 'offence' includes every act committed outside _5[Pakistan] which, if committed in _5[Pakistan], would be punishable under this Code. Illustrations. (a) A _6[a Pakistan subject], commits a murder in Uganda. He can be tried and convicted of murder in any place in _7[Pakistan] in which he may be found. (b) _8[] _9[(c)  C, a foreigner who is in the service of Pakistan commits a murder in London. He can be tried and convicted of murder at any place in Pakistan in which he may be found.] (d) D, a British subject living in _10[Junagadh], instigates E to commit a murder in _11[Lahore]. D is guilty of abetting murder.]", sections_list)
add_text_to_section("5", "_12[Nothing in this Act is intended to repeal, vary, suspend or affect any of the provisions of any Act for punishing mutiny and desertion of officers, soldiers, sailors or airmen in the service of the State or of any special or local law.].", sections_list)


# Display the modified sections to verify
# (prints the first five entries of sections_list, whatever their numbers are)
print("\nSections after manually adding section_text:")
for section in sections_list[:5]:
    print(section)

# Manually add text to sections 6-18
# Each call overwrites the 'section_text' of the matching entry in sections_list.
add_text_to_section("6", "Throughout this Code every definition of an offence, every penal provision and every illustration of every such definition or penal provision, shall be understood subject to the exceptions contained in the chapter entitled “General Exceptions,” though those exceptions are not repeated in such definition, penal provision or illustration. Illustrations (a) The sections in this Code, which contain definitions of offences, do not express that a child under seven years of age cannot commit such offences ; but the definitions are to be understood subject to the general exception which provides that nothing shall be an offence which is done by a child under seven years of age. (b) A, a police officer, without warrant, apprehends Z who has committed murder. Here A is not guilty of the offence of wrongful confinement ; for he was bound by law to apprehend Z, and therefore the case falls within the general exception which provides that “nothing is an offence which is done by a person who is bound by law to do it”.", sections_list)
add_text_to_section("7", "Every expression which is explained in any part of this Code is used in every part of this Code in conformity with the explanation.", sections_list)
add_text_to_section("8", "The pronoun “he” and its derivatives are used of any person, whether male or female.", sections_list)
add_text_to_section("9", "Unless the contrary appears from the context, words importing the singular number include the plural number, and words importing the plural number include the singular number.", sections_list)
add_text_to_section("10", "The word “man” denotes a male human being of any age : the word “woman” denotes a female human being of any age.", sections_list)
add_text_to_section("11", "The word “person” includes any Company or Association, or body of persons, whether incorporated or not.", sections_list)
add_text_to_section("12", "The word “public” includes any class of the public or any community.", sections_list)
add_text_to_section("13", "[Definition of “Queen”.] Omitted by A.O., 1961, Art. 2 and Sch. (w.e.f. the 23rd March, 1956).", sections_list)
add_text_to_section("14", "_1[The words “servant of the State” denote all officers or servants continued, appointed or employed in Pakistan, by or under the authority of the _2[Federal Government] or any Provincial Government.]", sections_list)
add_text_to_section("15", "[Definition of “British India”.] Rep. by A. O., 1937.", sections_list)
add_text_to_section("16", "[Definition of “Government of India”.] Rep. by A.O., 1937.", sections_list)
add_text_to_section("17", "The word “Government” denotes the person or persons authorized by law to administer executive Government in _3[Pakistan, or in any part thereof].", sections_list)
add_text_to_section("18", "[Definition of “Presidency”.] Rep. by A. O., 1937.", sections_list)

# Manually add text to sections 19-51
add_text_to_section("19", "The word “Judge” denotes not only every person who is officially designated as a Judge, but also every person, who is empowered by law to give, in any legal proceeding, civil or criminal, a definitive judgment, or a judgment which, if not appealed against, would be definitive, or a judgment which, if confirmed by some other authority, would be definitive, or who is one of a body of persons, which body of persons is empowered by law to give such a judgment. Illustrations (a) _1[] (b) A Magistrate exercising jurisdiction in respect of a charge on which he has power to sentence to fine or imprisonment with or without appeal, is a Judge. (c) _2[] (d) _1[]", sections_list)
add_text_to_section("20", "The words “Court of Justice” denote a Judge who is empowered by law to act judicially alone, or a body of Judges which is empowered by law to act judicially as a body, when such Judge or body of Judges is acting judicially. _3[]", sections_list)
# Section 21's text is written as two adjacent string literals, which Python
# joins into one string: a plain quoted literal cannot contain a raw line
# break, so the text must not be split inside a single pair of quotes.
add_text_to_section(
    "21",
    "The words “public servant” denote a person falling under any of the descriptions hereinafter following, namely: _4[] Second. Every Commissioned Officer in the Military _5[Naval or Air] Forces of _6[Pakistan] while serving under _7[the _8[Federal Government] or any Provincial Government]; Third. Every Judge ; Fourth. Every officer of a Court of Justice whose duty it is, as such officer, to investigate or report on any matter of law or fact, or to make, authenticate, or keep any document, or to take charge or dispose of any property, or to execute any judicial process, or to administer any oath, or to interpret, or to preserve order in the Court ; and every person specially authorized by a Court of Justice to perform any of such duties ; Fifth. Every juryman, assessor, or member of a panchayat assisting a Court of Justice or public servant ; Sixth. Every arbitrator or other person to whom any cause or matter has been referred for decision or report by any Court of Justice, or by any other competent public authority ; Seventh. Every person who holds any office by virtue of which he is empowered to place or keep any person in confinement ; Eighth. Every officer of _l[the Government] whose duty it is, as such officer, to prevent offences, to give information of offences, to bring offenders to justice, or to protect the public health, safety or convenience ; Nineth. "
    "Every officer whose duty it is, as such officer, to take, receive, keep or expend any property on behalf of _l[the Government], or to make any survey, assessment or contract on behalf of _l[the Government], or to execute any revenue-process, or to investigate, or to report, on any matter affecting the pecuniary interests of _l[the Government], or to make, authenticate or keep  any document relating to the pecuniary interests of _l[the Government], or to prevent the infraction of any law for the protection of the pecuniary interests of _1[the Government], and every officer in the  service or pay of _l[the Government] or remunerated by fees or commission for the performance of  any public duty ; Tenth. Every officer whose duty it is, as such officer, to take, receive, keep or expend any property, to make any survey or assessment or to levy any rate or tax for any secular common purpose of any village, town or district, or to make, authenticate or keep any document for the ascertaining of the rights of the people of any village, town or district ; _2[Eleventh. Every person who holds any office in virtue of which he is empowered to prepare, publish, maintain or revise an electoral roll or to conduct an election or part of an election.] Illustration. A Municipal Commissioner is a public servant. Explanation 1. Persons falling under any of the above descriptions are public servants, whether appointed by the Government or not. Explanation 2. Wherever the words “public servant” occur, they shall be understood of every person who is in actual possession of the situation of a public servant, whatever legal defect there  may be in his right to hold that situation. _2[Explanation 3. The word “election” denotes an election for the purpose of selecting members of any legislative, municipal or other public authority, of whatever character, the method of selection to which is by, or under, any law prescribed as by election.]",
    sections_list,
)
# Manual 'section_text' entries for sections 22-37; each call overwrites the
# text of the matching entry in sections_list.
add_text_to_section("22", "The words “Moveable property” are intended to include corporeal property of every description, except land and thing attached to the earth or permanently fastened to anything which is attached to the earth.", sections_list)
add_text_to_section("23", "“Wrongful gain” is gain by unlawful means of property to which the person gaining is not legally entitled. “Wrongful loss” is the loss by unlawful means of property to which the person losing it is legally entitled. A person is said to gain wrongfully when such person retains wrongfully, as well as when such person acquires wrongfully. A person is said to lose wrongfully when such person is wrongfully kept out of any property, as well as when such person is wrongfully deprived of property.", sections_list)
add_text_to_section("24", "Whoever does anything with the intention of causing wrongful gain to one person or wrongful loss to another person, is said to do that thing 'dishonestly'.", sections_list)
add_text_to_section("25", "A person is said to do a thing fraudulently if he does that thing with intent to defraud but not otherwise.", sections_list)
add_text_to_section("26", "A person is said to have “reason to believe” a thing if he has sufficient cause to believe that thing but not otherwise.", sections_list)
add_text_to_section("27", "When property is in the possession of a person's wife, clerk or servant, on account of that person, it is in that person's possession within the meaning of this Code. Explanation. A person employed temporarily or on a particular occasion in the capacity of a clerk, or servant, is a clerk or servant within the meaning of this section.", sections_list)
add_text_to_section("28", "A person is said to “counterfeit” who causes one thing to resemble another thing, intending by means of that resemblance to practice deception, or knowing it to be likely that deception will thereby be practiced. _1[Explanation 1. It is not essential to counterfeiting that the imitation should be exact. Explanation 2. When  a  person  causes  one  thing  to  resemble  another  thing,  and   the resemblance is such that a person might be deceived thereby, it shall be presumed, until the contrary is proved, that the person so causing the one thing to resemble the other thing intended by means of that resemblance to practice deception or knew it to be likely that deception would thereby be practiced.]", sections_list)
add_text_to_section("29", "The word “document” denotes any matter expressed or described upon any substance by means of letters, figures or marks, or by more than one of those means, intended to be used, or which may be used, as evidence of that matter. Explanation 1. It is immaterial by what means or upon what substance the letters, figures or marks are formed, or whether the evidence is intended for, or may be used in, a Court of Justice, or not. Illustrations. (a) A writing expressing the terms of a contract, which may be used as evidence of the contract, is a document. (b) A cheque upon a banker is a document. (c) A Power-of-Attorney is a document. (d) A map or plan which is intended to be used or which may be used as evidence, is a document. (e) A writing containing directions or instructions is a document. Explanation 2. Whatever is expressed by means of letters, figures or marks as explained by mercantile or other usage, shall be deemed to be expressed by such letters, figures or marks within the meaning of this section, although the same may not be actually expressed. Illustration. A writes his name on the back of a bill of exchange payable to his order. The meaning of the endorsement, as explained by mercantile usage, is that the bill is to be paid to the holder. The endorsement is a document, and must be construed in the same manner as if the words 'pay to the holder' or words to that effect had been written over the signature.", sections_list)
add_text_to_section("30", "The words “valuable security” denote a document which is, or purports to be, a document whereby any legal right is created, extended, transferred, restricted, extinguished or released, or whereby any person acknowledges that he lies under legal liability, or has not a certain legal right. Illustration. A writes his name on the back of a bill of exchange. As the effect of this endorsement is to transfer the right to the bill to any person who may become the lawful holder of it, the endorsement is a “valuable security”.", sections_list)
add_text_to_section("31", "The words “a will” denote any testamentary document.", sections_list)
add_text_to_section("32", "In every part of this Code, except where a contrary intention appears from the context, words which refer to acts done extend also to illegal omissions.", sections_list)
add_text_to_section("33", "The word “act” denotes as well a series of acts as a single act : the word “omission” denotes as well a series of omissions as a single omission.", sections_list)
add_text_to_section("34", "_1[When a criminal act is done by several persons, in furtherance of the common intention of all, each of such persons is liable for that act in the same manner as if it were done by him alone.]", sections_list)
add_text_to_section("35", "When such an act is criminal by reason of its being done with a criminal Knowledge or intention. Whenever an act, which is criminal only by reason of its being done with a criminal knowledge or intention, is done by several persons, each of such persons who joins in the act with such knowledge or intention is liable for the act in the same manner as if the act were done by him alone with that knowledge or intention.", sections_list)
add_text_to_section("36", "Wherever the causing of a certain effect, or an attempt to cause that effect, by an act or by an omission, is an offence, it is to be understood that the causing of that effect partly by an act and partly by an omission is the same offence. Illustration. A intentionally causes Z’s death, partly by illegally omitting to give Z food, and partly by beating Z. A has committed murder.", sections_list)

add_text_to_section("37", "When an offence is committed by means of several acts, whoever intentionally co-operates in the commission of that offence by doing any one of those acts, either singly or jointly with any other person, commits that offence. Illustrations (a) A and B agree to murder Z by severally and at different times giving him small doses of poison. A and B administer the poison according to the agreement with intent to murder Z. Z dies from the effects of the several doses of poison so administered to him. Here A and B intentionally co-operate in the commission of murder and as each of them does an act by which the death is caused, they are both guilty of the offence though their acts are separate. (b) A and B are joint jailors, and as such, have the charge of Z, a prisoner, alternately for six hours at a time. A and B, intending to cause Z's death, knowingly co-operate in causing that effect by illegally omitting, each during the time of his attendance, to furnish Z with food supplied to them for that purpose. Z dies of hunger. Both A and B are guilty of the murder of Z. (c) A, a jailor, has the charge of Z, a prisoner. A intending to cause Z's death, illegally omits to supply Z with food ; in consequence of which Z is much reduced in strength, but the starvation is not sufficient to cause his death. A is dismissed from his office, and B succeeds him. B, without collusion or co-operation with A, illegally omits to supply Z with food, knowing that he is likely thereby to cause Z's death. Z dies of hunger. B is guilty of murder, but, as A did not co-operate with B, A is guilty only of an attempt to commit murder.", sections_list)
add_text_to_section("38", "Where several persons are engaged or concerned in the commission of a criminal act, they may be guilty of different offences by means of that act. Illustration A attacks Z under such circumstances of grave provocation that his killing of Z would be only culpable homicide not amounting to murder. B having ill-will towards Z and intending to kill him, and not having been subject to the provocation, assists A in killing Z. Here, though A and B are both engaged in causing Z’s death, B is guilty of murder, and A is guilty only of culpable homicide.", sections_list)
add_text_to_section("39", "A person is said to cause an effect “voluntarily” when he causes it by means whereby he intended to cause it, or by means which, at the time of employing those means, he knew or had reason to believe to be likely to cause it. Illustration A sets fire, by night, to an inhabited house in a large town, for the purpose of facilitating robbery and thus causes the death of a person. Here, A may not have intended to cause death, and may even be sorry that death has been caused by his act : yet, if he knew that he was likely to cause death, he has caused death voluntarily.", sections_list)
add_text_to_section("40", "_1[Except in the _2[chapters] and sections mentioned in clauses 2 and 3 of this section, the word 'offence' denotes a thing made punishable by this Code. In Chapter IV, _3[Chapter VA] and in the following sections, namely, sections _4[64,] _4[65,] _4[66,] _5[67,] _4[71,] 109, 110, 112, 114, 115, 116, 117, 187, 194, 195, 203, 211, 213, 214, 221, 222, 223, 224, 225, 327, 328, 329, 330, 331, 347, 348, 388, 389, and 445, the word 'offence' denotes a thing punishable under this Code, or under any special or local law as hereinafter defined. And in sections 141, 176, 177, 201, 202, 212, 216 and 441 the word 'offence' has the same meaning when the thing punishable under the special or local law is punishable under such law with imprisonment for a term of six months or upwards, whether with or without fine.]", sections_list)
add_text_to_section("41", "A 'special law' is a law applicable to a particular subject.", sections_list)
add_text_to_section("42", "A 'local law' is a law applicable only to a particular part of _6[the territories comprised in _7[Pakistan]].", sections_list)
add_text_to_section("43", "The word “illegal” is applicable to everything which is an offence or which is prohibited by law, or which furnishes ground for a civil action : and a person is said to be 'legally bound to do' whatever it is illegal in him to omit.", sections_list)
add_text_to_section("44", "The word “injury” denotes any harm whatever illegally caused to any person, in body, mind, reputation or property.", sections_list)
add_text_to_section("45", "The word “life” denotes the life of a human being, unless the contrary appears from the context.", sections_list)
add_text_to_section("46", "The word “death” denotes the death of a human being, unless the contrary appears from the context.", sections_list)
add_text_to_section("47", "The word “animal” denotes any living creature, other than a human being.", sections_list)
add_text_to_section("48", "The word “vessel” denotes anything made for the conveyance by water of human beings or of property.", sections_list)
add_text_to_section("49", "Wherever the word “year” or the word “month” is used, it is to be understood that the year or the month is to be reckoned according to the British calendar.", sections_list)
add_text_to_section("50", "The word “section” denotes one of those portions of a chapter of this Code which are distinguished by prefixed numeral figures.", sections_list)
add_text_to_section("51", "The word “oath” includes a solemn affirmation substituted by law for an oath, and any declaration required or authorized by law to be made before a public servant or to be used for the purpose of proof, whether in a Court of Justice or not.", sections_list)
add_text_to_section("52", "Nothing is said to be done or believed in “good faith” which is done or believed without due care and attention.", sections_list)

# Display the modified sections to verify.
# The slice [18:52] prints list positions 18..51 (at most 34 entries).
# NOTE(review): which section numbers those positions hold depends on how
# sections_list was built earlier; the "(19-51)" label below is not checked here.
print("\nSections after manually adding section_text (19-51):")
for section in sections_list[18:52]:
    print(section)

# Collapse every run of whitespace (spaces, tabs, newlines) in the PDF text into
# a single space so that titles can later be located with plain substring/regex
# searches regardless of the original line breaks.
cleaned_text = re.sub(r'\s+', ' ', extracted_text).strip()

# Print the first few characters to verify
print("Cleaned text (first 500 characters):")
print(cleaned_text[:500])

def get_next_section(current_section_number, sections_list):
    """Look up a section by number and publish it and its successor as globals.

    The other helpers in this file read their inputs from the module-level
    names ``current_section``, ``current_section_title``, ``next_section`` and
    ``next_section_title``; this function is what sets them.

    Args:
        current_section_number (str): The ``section_number`` value to look for.
        sections_list (list): Section dicts carrying at least the keys
            ``section_number`` and ``section_title``.

    Returns:
        tuple | None: ``(current_section, current_section_title, next_section,
        next_section_title)`` when the section exists, otherwise ``None``.
        For the last section ``next_section`` is ``None`` and
        ``next_section_title`` is ``""``.
    """
    global current_section, current_section_title
    global next_section, next_section_title

    current_index = next(
        (
            index
            for index, section in enumerate(sections_list)
            if section["section_number"] == current_section_number
        ),
        None,
    )

    if current_index is None:
        # Leave the globals untouched: previously the loop variable leaked and
        # the *last* section was silently published as "current" (or a
        # NameError was raised for an empty list).
        print(f"Section {current_section_number} not found or it is the last section in the list.")
        return None

    found = sections_list[current_index]
    current_section = found["section_number"]
    current_section_title = found["section_title"]

    if current_index + 1 < len(sections_list):
        following = sections_list[current_index + 1]
        next_section = following["section_number"]
        next_section_title = following["section_title"]
        print(f"Current section: {current_section} , Title: {current_section_title} ; Next section: {next_section} , Title: {next_section_title}")
    else:
        # No successor: clear the "next" globals so a later helper cannot pick
        # up the values left behind by an earlier call. The title is an empty
        # string (not None) because callers feed it straight into re.sub().
        next_section = None
        next_section_title = ""
        print(f"Section {current_section} not found or it is the last section in the list.")

    return current_section, current_section_title, next_section, next_section_title

get_next_section("52", sections_list)

import re

def search_text_before_section_title(current_section_title, cleaned_text):
    """Find section-number candidates in the 10 characters preceding a title.

    The title is stripped of everything except word characters and whitespace,
    then located (first occurrence) in ``cleaned_text``. Digits optionally
    followed by capital letters (e.g. ``52`` or ``52A``) found in the
    10 characters before that occurrence are returned.

    Side effects: sets the module globals ``cleaned_current_section_title``
    and ``numbers_and_combinations`` (the latter is always reset, so a failed
    search never leaves the numbers of a previous section behind).

    Args:
        current_section_title (str): Title as stored in the section dict.
        cleaned_text (str): Whitespace-normalised document text.

    Returns:
        list[str]: The candidates found, or ``[]`` when the title is empty,
        is not present, or has no number in front of it.
    """
    global cleaned_current_section_title, numbers_and_combinations

    cleaned_current_section_title = re.sub(r'[^\w\s]', '', current_section_title).strip()
    numbers_and_combinations = []

    # The cleaned title contains only word characters and whitespace, so a
    # plain substring search is equivalent to the regex search used before.
    # An empty title would "match" at offset 0, which is meaningless.
    if cleaned_current_section_title:
        title_start = cleaned_text.find(cleaned_current_section_title)
    else:
        title_start = -1

    if title_start == -1:
        print(f"Could not find '{cleaned_current_section_title}' in the cleaned text.")
        return []

    text_before = cleaned_text[max(0, title_start - 10):title_start]
    numbers_and_combinations = re.findall(r'\d+[A-Z]*', text_before)

    if numbers_and_combinations:
        print(f"Found '{cleaned_current_section_title}' in cleaned_text , Text before '{cleaned_current_section_title}': '{text_before}' , Numbers and combinations: {numbers_and_combinations}")
        return numbers_and_combinations

    print("No numbers or combinations found.")
    return []

search_text_before_section_title(current_section_title, cleaned_text)

# Commented out IPython magic to ensure Python compatibility.
# %pip install fuzzywuzzy python-Levenshtein

# Check the section_title of section '52A'
from fuzzywuzzy import fuzz
import re

def match_and_verify(current_section_title, current_section):
    """Score a title against the last searched title and check its number.

    Both ``current_section_title`` and the module global
    ``cleaned_current_section_title`` (set by
    ``search_text_before_section_title``) are reduced to lowercase ASCII
    letters and compared with ``fuzz.ratio``. A status line is printed
    depending on whether ``current_section`` appears in the module global
    ``numbers_and_combinations``.

    Args:
        current_section_title (str): The title of the section to compare.
        current_section (str): The number of the current section.

    Returns:
        int: The fuzzy similarity ratio (0-100) between the two cleaned titles.
    """
    def letters_only(value):
        # Drop everything that is not an ASCII letter, then lowercase.
        return re.sub(r'[^a-zA-Z]', '', value).lower()

    title_letters = letters_only(current_section_title)
    reference_letters = letters_only(cleaned_current_section_title)
    similarity_score = fuzz.ratio(title_letters, reference_letters)

    number_confirmed = current_section in numbers_and_combinations
    if not number_confirmed:
        print("Current Section Number Not Found in numbers_and_combinations")
    else:
        print(f"Section Title Match: {similarity_score}% , Section Number Match: 100%")

    return similarity_score

match_and_verify(current_section_title, current_section)

import re

def get_text_between_sections(current_section_title, next_section_title, cleaned_text):
    """Return the text lying between two section titles in ``cleaned_text``.

    Both titles are stripped of everything except word characters and
    whitespace; the shortest span between the first occurrence of the current
    title and the following occurrence of the next title is taken, and leading
    and trailing non-word characters are trimmed from it (punctuation inside
    the span is kept).

    Side effect: sets the module global ``cleaned_text_between`` to the
    result. It is reset to ``None`` on every failed lookup so that a later
    ``update_section_text`` call cannot write the previous section's text
    into the current section.

    Args:
        current_section_title (str): Title that opens the span.
        next_section_title (str): Title that closes the span.
        cleaned_text (str): Whitespace-normalised document text.

    Returns:
        str | None: The trimmed text, or ``None`` when either title is empty
        or the pair cannot be found in order.
    """
    global cleaned_text_between

    # Define the start and end patterns ("or ''" tolerates a missing title).
    start_pattern = re.sub(r'[^\w\s]', '', current_section_title or "").strip()
    end_pattern = re.sub(r'[^\w\s]', '', next_section_title or "").strip()

    cleaned_text_between = None

    match = None
    # An empty delimiter would match at an arbitrary position, so require both.
    if start_pattern and end_pattern:
        match = re.search(
            f"{re.escape(start_pattern)}(.*?){re.escape(end_pattern)}",
            cleaned_text,
            re.DOTALL,
        )

    if match is None:
        print(f"Could not find text between '{start_pattern}' and '{end_pattern}'.")
        return None

    text_between = match.group(1).strip()
    # Remove leading and trailing non-alphanumeric characters (including whitespace)
    cleaned_text_between = re.sub(r'^\W+|\W+$', '', text_between)
    print(f"Text between '{start_pattern}' and '{end_pattern}': {cleaned_text_between}")
    return cleaned_text_between

get_text_between_sections(current_section_title, next_section_title, cleaned_text)

def update_section_text(section_number, text_to_add, sections_list):
    """Overwrite the ``section_text`` of the first section with a given number.

    Args:
        section_number (str): Value to match against each dict's
            ``section_number`` key.
        text_to_add (str): New value stored under ``section_text``.
        sections_list (list): Section dicts; the matching one is modified
            in place.

    Prints a confirmation when a section was updated, or a notice when no
    section carries that number. Returns nothing.
    """
    for entry in sections_list:
        if entry["section_number"] != section_number:
            continue
        # Only the first match is touched; later duplicates stay as they are.
        entry["section_text"] = text_to_add
        print(f"Updated section_text for section {section_number}")
        return

    print(f"Section {section_number} not found in sections_list.")

update_section_text(current_section, cleaned_text_between, sections_list)

# Manual run of the five-step pipeline for section "52".
# The helpers pass data to each other through module-level globals
# (current_section, current_section_title, next_section_title,
# cleaned_text_between), so the calls must stay in this order.
get_next_section("52", sections_list)
search_text_before_section_title(current_section_title, cleaned_text)
match_and_verify(current_section_title, current_section)
get_text_between_sections(current_section_title, next_section_title, cleaned_text)
update_section_text(current_section, cleaned_text_between, sections_list)
print("Loop Complete\n---------------------------")

# Second manual run with the same arguments as above (again section "52").
get_next_section("52", sections_list)
search_text_before_section_title(current_section_title, cleaned_text)
match_and_verify(current_section_title, current_section)
get_text_between_sections(current_section_title, next_section_title, cleaned_text)
update_section_text(current_section, cleaned_text_between, sections_list)
print("Loop Complete\n---------------------------")

# Loop through all sections in the sections_list and run the same pipeline
# for each of them; every iteration overwrites that section's section_text.
for section in sections_list:
    # Get the current section number
    current_section_number = section["section_number"]

    # get_next_section publishes the current/next section details as globals,
    # which the following four calls then consume.
    get_next_section(current_section_number, sections_list)
    search_text_before_section_title(current_section_title, cleaned_text)
    match_and_verify(current_section_title, current_section)
    get_text_between_sections(current_section_title, next_section_title, cleaned_text)
    update_section_text(current_section, cleaned_text_between, sections_list)
    print("---------------------------") # Separator for clarity between iterations

# Spot-check: print the two entries at list positions 51 and 52.
for section in sections_list[51:53]:
    print(section)

#These are the codes that I can use for parsing the legal text and creating a KG

# Define a basic tool -- send a parameterized cypher query
# Say hello tool for your agent
def say_hello(person_name: str) -> dict:
    """Formats a welcome message to a named person.

    Args:
        person_name (str): the name of the person saying hello

    Returns:
        dict: A dictionary containing the results of the query.
              Includes a 'status' key ('success' or 'error').
              If 'success', includes a 'query_result' key with an array of result rows.
              If 'error', includes an 'error_message' key.
    """
    # NOTE(review): the docstring above is left untouched on purpose - this
    # function is passed to Agent(tools=[...]) below and the docstring may be
    # what the agent framework shows to the model; confirm before rewording.
    # NOTE(review): `graphdb` is not defined in this part of the file;
    # presumably a graph-database helper created in an earlier cell - verify.
    return graphdb.send_query("RETURN 'Hello to you, ' + $person_name AS reply",
    {
        "person_name": person_name
    })
    # Passing the name as the $person_name query parameter (instead of building
    # the query string by concatenation) also avoids injection attacks: whatever
    # the caller supplies is treated as the person's name, never as Cypher.

# Define the Cypher Agent.
# Designed for a simple interaction with the user: it asks for the user's name
# and then uses the say_hello tool to produce a greeting.
# NOTE(review): `Agent` and `llm` are not defined in this part of the file;
# they are presumably imported/created in an earlier cell - verify.
hello_agent = Agent(
    name="hello_agent_v1",
    model=llm, # defined earlier in a variable
    description="Has friendly chats with a user.",
    instruction="""You are a helpful assistant, chatting with a user.
                Be polite and friendly, introducing yourself and asking who the user is.

                If the user provides their name, use the 'say_hello' tool to get a custom greeting.
                If the tool returns an error, inform the user politely.
                If the tool is successful, present the reply.
                """,
    tools=[say_hello], # Pass the function directly
)

print(f"Agent '{hello_agent.name}' created.")



from bs4 import BeautifulSoup
import requests
import json

# --- Load the webpage ---
url = "https://www.pakistani.org/pakistan/legislation/1860/actXLVof1860.html"
# requests has no default timeout, so a stalled server would hang this cell
# forever; raise_for_status() stops us from parsing an HTTP error page as if
# it were the statute.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, "lxml")

# --- Collect candidate notes: every <div>/<p> that carries an id ---
# The id is what local "#..." links elsewhere in the page point at.
notes_dict = {}
for note_tag in soup.find_all(['div', 'p'], id=True):
    note_id = note_tag.get('id')
    note_text = note_tag.get_text(strip=True)
    if note_id and note_text:
        notes_dict[note_id] = note_text

# --- Extract content with link references ---
tags_to_extract = ['h1', 'h2', 'h3', 'h4', 'p']

content_list = []
for tag in soup.find_all(tags_to_extract):
    text = tag.get_text(" ", strip=True)
    linked_notes = []

    # Resolve local hyperlinks inside this tag to the note text collected above
    for a_tag in tag.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('#'):  # local reference
            note_id = href.replace('#', '')
            if note_id in notes_dict:
                linked_notes.append(notes_dict[note_id])

    content_list.append({
        "tag": tag.name,
        "text": text,
        "linked_notes": linked_notes,
        # Heading depth (1-4) for <h1>..<h4>, 0 for paragraphs
        "level": int(tag.name[1]) if tag.name.startswith('h') else 0
    })

# --- Save to JSON ---
with open("pakistan_penal_code_with_notes.json", "w", encoding="utf-8") as f:
    json.dump(content_list, f, ensure_ascii=False, indent=2)

print(f"Extracted {len(content_list)} items with linked notes.")

for content in content_list:
    print(content)

import re

for tag in soup.find_all(re.compile("^h[1-6]$")):
    print(tag.name, tag.text)

import requests
from bs4 import BeautifulSoup

# Ensure the URL is defined from previous steps, or define it here if not
# url = "https://www.pakistani.org/pakistan/legislation/1860/actXLVof1860.html"

# Fetch the HTML content from the URL again.
# An explicit timeout keeps the cell from hanging on a stalled connection and
# raise_for_status() prevents an HTTP error page from being analysed below.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Split the content into lines and process each line
lines = html_content.splitlines()

print("Processing entire HTML content line by line:")
for i, line in enumerate(lines, start=1):
    line = line.strip()
    if not line:
        continue  # Skip empty lines

    # Parse each line on its own to see which tags and text it contains
    soup = BeautifulSoup(line, "html.parser")
    tags = [tag.name for tag in soup.find_all()]
    text = soup.get_text(strip=True)

    print(f"Line {i}:")
    print(f"  Tags: {tags if tags else 'No tags found'}")
    print(f"  Text: {text if text else 'No text found'}")
    print("-" * 100)

# Print the first 1000 lines from the 'lines' list
print("Printing the first 1000 lines:")
for line in lines[:1000]:
    print(line)

if len(lines) > 1000:
    print("\n... (Remaining lines not shown) ...")

# Print the tail of the document (at most the last 4000 lines)
print("Printing the last 4000 lines:")
start_index = max(0, len(lines) - 4000)
for line in lines[start_index:]:
    print(line)

if len(lines) > 4000:
    print("\n... (Remaining lines shown) ...")
else:
    print("\n(Fewer than 4000 lines were available.)")

from bs4 import BeautifulSoup
import json
import re

# Use the html_content variable directly instead of loading from a file
# with open("pakistan_penal_code.html", "r", encoding="utf-8") as f:
#     html = f.read()
html = html_content # Use the already loaded html_content

# Re-parse the whole page; this rebinds `soup`, which the line-by-line cell
# above had left pointing at a single parsed line.
soup = BeautifulSoup(html, "html.parser")

# ---- STEP 1: Extract Code Title ----
# The first <h2> is taken as the title; a fixed default is used if none exists.
code_title_tag = soup.find("h2")
code_title = code_title_tag.get_text(strip=True) if code_title_tag else "Pakistan Penal Code"

# ---- STEP 2: Extract Footnotes from the <div class="references"> ----
# Maps the id of each <div> nested inside div.references to its text.
# NOTE: the name `footnotes` is rebound to a list further down in this file
# (footnote-number detection on the PDF data).
footnotes = {}
for div in soup.select("div.references div"):
    note_id = div.get("id")
    if note_id:
        note_text = div.get_text(" ", strip=True)
        footnotes[note_id] = note_text

# ---- STEP 3: Initialize JSON Structure ----
# "chapters" is filled by the chapter-detection step that follows.
code_data = {
    "code_title": code_title,
    "chapters": []
}

# ---- STEP 4: Find All Chapter Blocks (robust detection) ----

# Helper: convert Roman -> int (works for typical chapter numerals)
def roman_to_int(s):
    """Convert a Roman numeral string to an integer (case-insensitive).

    A symbol that is smaller than the symbol immediately to its right is
    subtracted, every other symbol is added. Characters that are not Roman
    symbols count as 0, and an empty string yields 0.
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [symbol_values.get(symbol, 0) for symbol in s.upper()]
    # Pair each value with its right-hand neighbour (0 after the last symbol).
    right_neighbours = values[1:] + [0]
    return sum(
        -value if value < neighbour else value
        for value, neighbour in zip(values, right_neighbours)
    )

# Detect Chapters
# Every <h4> is scanned in document order. An <h4> that looks like a chapter
# number opens a new chapter; the first non-matching <h4> after it is taken as
# that chapter's title when the number heading did not carry one itself.
chapters = soup.find_all("h4")
current_chapter = None  # chapter dict currently being built, not yet stored

# regex: optional "CHAPTER ", capture roman numerals, optional title remainder
# NOTE(review): because of IGNORECASE and the optional "CHAPTER" prefix, any
# heading made up solely of the letters I, V, X, L, C, D, M (e.g. "CIVIL")
# also matches and would be treated as a chapter number.
chapter_pattern = re.compile(r'^(?:CHAPTER\s*)?([IVXLCDM]+)(?:\s*[\-:—]\s*(.*))?$', re.IGNORECASE)

for h4 in chapters:
    text = h4.get_text(" ", strip=True)  # normalize whitespace

    # Try to match roman numeral (with or without the word CHAPTER)
    m = chapter_pattern.match(text)
    if m:
        roman = m.group(1).upper()
        # title might be in same h4 after a dash or colon
        maybe_title = (m.group(2) or "").strip()

        # if we already had an open chapter, save it
        if current_chapter:
            code_data["chapters"].append(current_chapter)

        # create new chapter using roman->int for stable numbering
        try:
            chap_num = roman_to_int(roman)
        except Exception:
            # fallback to length+1 if something odd happens
            # NOTE(review): roman_to_int maps unknown characters to 0 instead
            # of raising, so this branch is presumably never reached.
            chap_num = len(code_data["chapters"]) + 1

        current_chapter = {
            "chapter_number": chap_num,
            "chapter_roman": roman,
            "chapter_title": maybe_title if maybe_title else "",
            "sections": [],
            "chapter_footnotes": []
        }
        continue

    # If we reach here and we have a current chapter with empty title,
    # assume this h4 is the chapter title (common pattern: CHAPTER I then next h4 is title)
    if current_chapter and not current_chapter["chapter_title"]:
        # Some h4s use uppercase titles (INTRODUCTION), we accept the text
        current_chapter["chapter_title"] = text
        continue

# Append last seen chapter (the loop only stores a chapter when the next one starts)
if current_chapter:
    code_data["chapters"].append(current_chapter)

def extract_footnotes_with_spans(section_html, footnotes_dict):
    """Split a section cell into plain text and footnote-delimited spans.

    The direct children of ``section_html`` are walked in order. Each
    ``<a href=...>`` child toggles a footnote span: the first link opens a
    span named after its href (``#`` removed), the next link closes it. Text
    collected inside a closed span is reported as a linked footnote; text
    collected outside spans (and any text left over at the end, even inside a
    span that was never closed) goes into the section text.

    Args:
        section_html (Tag): Element whose ``.contents`` are scanned
            (a BeautifulSoup ``<td>`` or similar).
        footnotes_dict (dict): Maps footnote ids to their note texts.

    Returns:
        tuple: ``(section_text, linked_footnotes)``
            - section_text (str): Space-joined text found outside closed spans.
            - linked_footnotes (list): Dicts with ``note_id``, ``note_text``
              (``""`` when the id is unknown) and ``related_text``.
    """
    plain_segments = []
    linked_footnotes = []
    open_note = None
    pending = []  # text pieces gathered since the last flush

    def take_pending():
        # Return the stripped pending text; the pieces are cleared only when
        # that text is non-empty.
        collected = "".join(pending).strip()
        if collected:
            pending.clear()
        return collected

    for node in section_html.contents:
        is_link = getattr(node, "name", None) == "a" and node.has_attr("href")
        if not is_link:
            # Nested tags contribute their text; bare strings are used as-is.
            if hasattr(node, "get_text"):
                pending.append(node.get_text(" ", strip=True))
            else:
                pending.append(str(node))
            continue

        target = node["href"].replace("#", "")
        collected = take_pending()
        if open_note is None:
            # Link opens a span: what came before is ordinary section text.
            if collected:
                plain_segments.append(collected)
            open_note = target
        else:
            # Link closes the span: what came before belongs to the open note.
            if collected:
                linked_footnotes.append({
                    "note_id": open_note,
                    "note_text": footnotes_dict.get(open_note, ""),
                    "related_text": collected
                })
            open_note = None

    leftover = "".join(pending).strip()
    if leftover:
        plain_segments.append(leftover)

    return " ".join(plain_segments).strip(), linked_footnotes

# ---- STEP 5: Extract Sections ----
for chapter in code_data["chapters"]:
    # Locate this chapter's own <h4>. The pattern is anchored on both ends:
    # BeautifulSoup applies a compiled regex with search(), so the bare numeral
    # "V" would otherwise hit the first heading that merely contains a "v"
    # (e.g. "CHAPTER IV") and attach the wrong table to the chapter.
    heading_pattern = re.compile(
        rf"^\s*(?:CHAPTER\s*)?{re.escape(chapter['chapter_roman'])}(?:\s*[\-:—].*)?\s*$",
        re.IGNORECASE | re.DOTALL,
    )
    chapter_div = soup.find("h4", string=heading_pattern)
    if not chapter_div:
        continue

    # Look for the following <table> element containing the sections
    table = chapter_div.find_next("table")
    if not table:
        continue

    rows = table.find_all("tr")
    for row in rows:
        cols = row.find_all("td")
        # A section row has exactly two cells: number | title + text
        if len(cols) != 2:
            continue

        # Extract section number (leading digits of the first cell)
        section_number_tag = cols[0].get_text(strip=True)
        section_number_match = re.match(r"(\d+)", section_number_tag)
        if not section_number_match:
            continue

        section_number = int(section_number_match.group(1))

        # Section title and text: the first <b> is the title, the remainder
        # of the cell is the text.
        bold_texts = cols[1].find_all("b")
        if bold_texts:
            section_title = bold_texts[0].get_text(strip=True)
            section_text = cols[1].get_text(" ", strip=True).replace(section_title, "", 1).strip()
        else:
            section_title = "Untitled Section"
            section_text = cols[1].get_text(" ", strip=True)

        # ---- STEP 6: Link Footnotes ----
        # Only the footnote links are taken from the helper. Its text result
        # leaves out the footnoted passages and still contains the title, so
        # the full text computed above is kept as section_text.
        _, linked_footnotes = extract_footnotes_with_spans(cols[1], footnotes)

        # Store the section. Previously the parsed values were discarded, so
        # every chapter was saved with an empty "sections" list.
        chapter["sections"].append({
            "section_number": section_number,
            "section_title": section_title,
            "section_text": section_text,
            "footnotes": linked_footnotes
        })

# ---- STEP 7: Save to JSON ----
with open("pakistan_penal_code.json", "w", encoding="utf-8") as f:
    json.dump(code_data, f, ensure_ascii=False, indent=4)

print("✅ Extraction complete! JSON saved as pakistan_penal_code.json")

import json

# Read back the JSON written by the extraction step and print an outline of
# it (chapter number/title, then each section's number/title) as a check.
with open("pakistan_penal_code.json", "r", encoding="utf-8") as f:
    json_output = json.load(f)

print("Extracted Chapters and Sections:")
print("--------------------------------")

if 'chapters' in json_output:
    for chapter in json_output['chapters']:
        # 'N/A' is shown for any key missing from the stored dict
        chapter_number = chapter.get('chapter_number', 'N/A')
        chapter_title = chapter.get('chapter_title', 'N/A')
        print(f"\nChapter {chapter_number}: {chapter_title}")

        for section in chapter.get('sections', []):
            section_number = section.get('section_number', 'N/A')
            section_title = section.get('section_title', 'N/A')
            print(f"  Section {section_number}: {section_title}")

print("--------------------------------")

"""This is not working, so I'll try a different approach: first save the chapter numbers, section numbers and titles, and then append them to the JSON as we go. I need the following pieces of code.

1. A code to append and update json, including the nestings that I have created.
2. A way to detect and match the html with the pdf so that if something is not found here, it can be found in the pdf and updated accordingly.
3. A way to update the json section by section. There are 650 sections. So if I start updating them one by one, I think I'll have to work for the first five chapters' sections and footnotes and then the entire document will fall into place.
4. The footnotes in the pdf are really long. So don't put them in the json unless necessary.

I have finally found a pattern. The footnotes, sections and chapters, all have different font size and style. I can make a regex based on that and then extract. This is more doable than html. Moreover, html has a lot of mistakes and is not at all complete. Therefore pdf is better.
"""

import fitz  # PyMuPDF
#----First pass: Print all pages----
doc = fitz.open("Pakistan Penal Code.pdf")


def _span_styles(span):
    """Return the style tags ("bold", "italic", "all_caps") that apply to a span dict."""
    font = span["font"]
    text = span["text"]
    styles = []
    if "Bold" in font:
        styles.append("bold")
    if "Italic" in font or "Oblique" in font:
        styles.append("italic")
    # All caps only counts for spans that contain visible text
    if text.isupper() and bool(text.strip()):
        styles.append("all_caps")
    return styles


def _iter_spans(page):
    """Yield each span dict of a page, in the order page.get_text("dict") returns them."""
    for block in page.get_text("dict")["blocks"]:
        for line in block.get("lines", []):
            yield from line.get("spans", [])


def _print_span(span):
    """Print one span as 'styles | font size | text'."""
    print(f"{' '.join(_span_styles(span)):10} | {span['size']:5.1f} | {span['text']}")


for page in doc:
    for span in _iter_spans(page):
        _print_span(span)

# ---- Second Pass: Print pages 30–59 ----
print("\n\n=== Printing pages 30–59 ===\n")

# Page numbers are 0-indexed in PyMuPDF; the upper bound is clamped so a
# shorter document is handled without indexing past its last page.
for page_num in range(29, min(59, len(doc))):
    for span in _iter_spans(doc[page_num]):
        _print_span(span)

# Instead of print(), save data to json
page_data = []

for page_num in range(29, min(59, len(doc))):
    for span in _iter_spans(doc[page_num]):
        page_data.append({
            "page_number": page_num + 1,  # stored 1-based
            "style": _span_styles(span),
            "font_size": span["size"],
            "text": span["text"]
        })

# Save as JSON for later use
import json
with open("penal_code_pages_30_59.json", "w", encoding="utf-8") as f:
    json.dump(page_data, f, ensure_ascii=False, indent=4)

import json
with open("penal_code_pages_30_59.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Apply NLP / parsing / regex / LLM processing to this data only
print(json.dumps(data, indent=4, ensure_ascii=False))

# This code detects footnote reference numbers in the body text.
# A span is treated as a footnote marker when (1) its font size lies between
# 8.0 and 9.5, (2) its text is purely numeric, and (3) the span that follows
# it uses a font larger than 9.5.
import json

# Load your extracted JSON
with open("penal_code_pages_30_59.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# NOTE: this rebinds `footnotes`, which held the HTML footnote dict earlier in the file.
footnotes = []
# Counts the markers found per page.
# NOTE(review): the counter is updated below but not used to build note_id
# within this block (note_id uses the marker text instead).
note_counter = {}  # keeps count per page so numbering restarts: 30-1, 30-2, ...

# Iterate through entries
for i, entry in enumerate(data):
    font_size = entry.get("font_size", 0)
    text = entry.get("text", "").strip()
    page = entry.get("page_number")

    # Skip if current entry does not match criteria 1 & 2
    if not (8.0 <= font_size <= 9.5):
        continue

    if not text.isdigit():  # must be numeric
        continue

    # Check next entry exists (the last span has no successor to compare with)
    if i + 1 >= len(data):
        continue

    next_entry = data[i + 1]
    next_font = next_entry.get("font_size", 0)

    # Confirm footnote via rule 3: the following span must be larger than 9.5
    if next_font <= 9.5:
        continue

    # Track numbering per page
    if page not in note_counter:
        note_counter[page] = 1
    else:
        note_counter[page] += 1

    # Identifier is "<page>-<marker text>", so a marker repeated on the same
    # page produces the same note_id more than once.
    note_id = f"{page}-{text}"

    footnotes.append({
        "note_id": note_id,
        "page": page,
        "number": text
    })

# Save the results
with open("footnotes_in_text.json", "w", encoding="utf-8") as f:
    json.dump(footnotes, f, indent=4, ensure_ascii=False)

print("✅ Footnote extraction complete! Saved to footnotes_in_text.json")

with open("footnotes_in_text.json", "r", encoding="utf-8") as f:
    footnotes_text = json.load(f)

# Apply NLP / parsing / regex / LLM processing to this data only
print(json.dumps(footnotes_text, indent=4, ensure_ascii=False))

# Collapse repeated note_ids on each page into a single entry and rewrite
# footnotes_in_text.json with the result.
import json

# Load the original extracted footnotes
with open("footnotes_in_text.json", encoding="utf-8") as src:
    data = json.load(src)

# Step 1 - bucket the entries by page (pages keep their first-seen order)
pages = {}
for entry in data:
    pages.setdefault(entry["page"], []).append(entry)

# Step 2 - within each page keep one entry per note_id
cleaned_pages = {}
for page, entries in pages.items():
    merged = {}

    for item in entries:
        kept = merged.setdefault(item["note_id"], item)
        if kept is item:
            # first occurrence: it becomes the base entry
            continue

        # later occurrence: fold its fields into the base entry
        for key, value in item.items():
            if key not in kept or not kept[key]:
                # missing or empty in the base entry -> take the new value
                kept[key] = value
            elif (
                kept[key] != value
                and isinstance(kept[key], str)
                and isinstance(value, str)
                and value not in kept[key]
            ):
                # differing text that is not already contained -> append it
                kept[key] += " " + value

    cleaned_pages[page] = list(merged.values())

# Step 3 - flatten the per-page lists into one list
final_output = [
    kept_entry
    for page_entries in cleaned_pages.values()
    for kept_entry in page_entries
]

# Step 4 - save the cleaned JSON over the original file
with open("footnotes_in_text.json", "w", encoding="utf-8") as dst:
    json.dump(final_output, dst, ensure_ascii=False, indent=4)

print("✅ Done! Cleaned footnotes saved as footnotes_in_text.json")

# Read the file back and print it for inspection
with open("footnotes_in_text.json", encoding="utf-8") as src:
    footnotes_text = json.load(src)

print(json.dumps(footnotes_text, ensure_ascii=False, indent=4))

# Collect the footnote texts printed in the footnote bar of each page.
#
# A span whose font size lies between 5.0 and 6.0 and whose stripped text is
# purely numeric is treated as a footnote marker. The spans after a marker form
# that footnote's text until the next marker or the end of the page.
import json

# Load your JSON data
with open("penal_code_pages_30_59.json", "r", encoding="utf-8") as f:
    data = json.load(f)

footnotes_list = []  # To store footnotes as a list of objects
current_page = None
current_note_id = None
current_note_text = ""
note_counter = 0

for entry in data:
    page = entry["page_number"]
    text = entry["text"].strip()
    font_size = entry["font_size"]

    if page != current_page:
        # A footnote belongs to the page it starts on: close the open one here,
        # otherwise it would keep absorbing the body text of the next page
        # until that page's first marker.
        if current_note_id:
            footnotes_list.append({
                "note_id": current_note_id,
                "note_text": current_note_text.strip()
            })
            current_note_id = None
            current_note_text = ""

        # Numbering restarts on every page
        note_counter = 0
        current_page = page

    # Detect footnote: font size between 5.0 and 6.0 AND text is numeric
    if 5.0 <= font_size <= 6.0 and text.isdigit():
        # If there is a previous footnote, save its accumulated text
        if current_note_id:
            footnotes_list.append({
                "note_id": current_note_id,
                "note_text": current_note_text.strip()
            })

        # Reset for new footnote
        note_counter += 1
        current_note_id = f"{page}-{note_counter}"
        current_note_text = ""
    elif current_note_id:
        # Accumulate text only once a footnote has started on this page
        if current_note_text:
            current_note_text += " "
        current_note_text += text

# Save the last footnote
if current_note_id:
    footnotes_list.append({
        "note_id": current_note_id,
        "note_text": current_note_text.strip()
    })

# Write to JSON file
with open("footnotes_in_bar.json", "w", encoding="utf-8") as f:
    json.dump(footnotes_list, f, ensure_ascii=False, indent=4)

print("Footnotes extracted and saved to footnotes_in_bar.json")

# Read the file back and print it for inspection
with open("footnotes_in_bar.json", "r", encoding="utf-8") as f:
    footnotes_bar = json.load(f)

print(json.dumps(footnotes_bar, indent=4, ensure_ascii=False))

# It missed 35-1, 38-7, 40 (1-8) and 58 (1-2), possibly because their font size is larger than the range the detector accepts.

# Some footnotes contain the page footer (e.g. "Page 30 of 179") and the text
# that follows it; this cell cuts each note_text at the footer and rewrites
# the JSON file.
import json
import re

# Load the footnotes JSON
with open("footnotes_in_bar.json", "r", encoding="utf-8") as f:
    footnotes = json.load(f) # footnotes is a list of dictionaries

# Footer pattern "Page <number> of <total>". The total is matched as any number
# instead of the literal 179, and any run of whitespace is accepted between the
# words, because note_text is assembled from separately joined spans.
pattern = re.compile(r"Page\s+\d+\s+of\s+\d+")

# Process each footnote
for footnote_entry in footnotes:
    note_text = footnote_entry["note_text"]
    match = pattern.search(note_text)
    if match:
        # Keep only what precedes the footer
        footnote_entry["note_text"] = note_text[:match.start()].strip()

# Save the cleaned footnotes back to the same JSON file
with open("footnotes_in_bar.json", "w", encoding="utf-8") as f:
    json.dump(footnotes, f, ensure_ascii=False, indent=4)

print("Footnotes cleaned and saved to footnotes_in_bar.json")

# Ask the user for every footnote text that is referenced in the running text
# but absent from the footnote bar, then rewrite footnotes_in_bar.json in the
# order of footnotes_in_text.json.
import json

# Load JSON files (both hold a list of dictionaries)
with open("footnotes_in_text.json", encoding="utf-8") as text_file:
    footnotes_A_list = json.load(text_file)

with open("footnotes_in_bar.json", encoding="utf-8") as bar_file:
    footnotes_B_list = json.load(bar_file)

# note_id -> entry for A; the dict preserves the order in which ids first appear
footnotes_A_ordered_dict = {}
for item in footnotes_A_list:
    footnotes_A_ordered_dict[item['note_id']] = item

# note_id -> note_text for B, used for lookup and for storing the user's input
footnotes_B_map = {}
for item in footnotes_B_list:
    footnotes_B_map[item['note_id']] = item['note_text']

# Ids referenced in A that B does not have
missing_note_ids = [nid for nid in footnotes_A_ordered_dict if nid not in footnotes_B_map]

if not missing_note_ids:
    print("No missing note_ids found. B is complete.")
else:
    print(f"Missing note_ids in footnotes_in_bar.json: {missing_note_ids}\n")

    # Prompt once per missing id and store the answer
    for note_id in missing_note_ids:
        footnotes_B_map[note_id] = input(f"Enter text for missing note_id {note_id}: ").strip()

# Rebuild B following the order of A
updated_footnotes_B_list = []
for note_id in footnotes_A_ordered_dict:
    if note_id not in footnotes_B_map:
        # Not expected after the prompts above; kept as a safety net
        print(f"Warning: Note_id {note_id} from footnotes_A_list was not in footnotes_B_map after user input. Adding with empty text.")
    updated_footnotes_B_list.append({
        "note_id": note_id,
        "note_text": footnotes_B_map.get(note_id, "")
    })

# Save the updated JSON
with open("footnotes_in_bar.json", "w", encoding="utf-8") as bar_file:
    json.dump(updated_footnotes_B_list, bar_file, ensure_ascii=False, indent=4)

print("footnotes_in_bar.json updated")

# Combine footnotes_in_text.json and footnotes_in_bar.json into a single file
import json

# Load JSON files
with open("footnotes_in_text.json", encoding="utf-8") as text_file:
    footnotes_A = json.load(text_file)  # entries with note_id, page, number

with open("footnotes_in_bar.json", encoding="utf-8") as bar_file:
    footnotes_B = json.load(bar_file)  # entries with note_id, note_text

# note_id -> note_text lookup built from B
footnotes_B_lookup = {}
for item in footnotes_B:
    footnotes_B_lookup[item['note_id']] = item['note_text']

# Every entry of A is copied and extended with the matching "note_text"
# (an empty string when B has no text for that note_id).
merged_footnotes = [
    {**item_a, "note_text": footnotes_B_lookup.get(item_a['note_id'], "")}
    for item_a in footnotes_A
]

# Save the merged JSON
with open("footnotes_merged_dict.json", "w", encoding="utf-8") as merged_file:
    json.dump(merged_footnotes, merged_file, ensure_ascii=False, indent=4)

print("Merged footnotes saved as footnotes_merged_dict.json")

# Reload the page dump and print it for inspection
with open("penal_code_pages_30_59.json", encoding="utf-8") as pages_file:
    data = json.load(pages_file)

print(json.dumps(data, ensure_ascii=False, indent=4))

#This code is to extract all the possible entries which might contain section numbers and titles
import json

input_file = "penal_code_pages_30_59.json"
output_file = "filtered_sections.json"

def extract_filtered_entries():
    """Keep the spans that may be section numbers or titles and save them.

    Reads the JSON list at ``input_file`` and writes to ``output_file`` every
    entry that is 12.0pt, bold, not all-caps and does not mention
    "Illustration". Prints how many entries were kept.
    """
    with open(input_file, encoding="utf-8") as src:
        entries = json.load(src)

    filtered = []
    for item in entries:
        styles = item.get("style", [])
        body = item.get("text", "")

        right_size = item.get("font_size") == 12.0
        is_bold = "bold" in styles
        is_caps = "all_caps" in styles
        # "Illustrations" contains "Illustration", so one test covers both
        mentions_illustration = "Illustration" in body

        if right_size and is_bold and not (mentions_illustration or is_caps):
            filtered.append(item)

    # Save output
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(filtered, dst, indent=4, ensure_ascii=False)

    print(f"Saved {len(filtered)} entries → {output_file}")

# Run the filter step, then reload its output file and print it for inspection
extract_filtered_entries()

import json
with open("filtered_sections.json", encoding="utf-8") as result_file:
    filtered = json.load(result_file)

print(json.dumps(filtered, ensure_ascii=False, indent=4))

#This code is to extract just the section numbers, to check if all the section numbers are extracted correctly
import json

input_file = "filtered_sections.json"
output_file = "sections.json"

def extract_sections_simple():
    """List every filtered span that contains a digit as a section number.

    Reads ``input_file``, keeps the stripped text of each entry that has at
    least one digit, and writes records with an empty "section_title" to
    ``output_file``. Prints how many were found.
    """
    with open(input_file, encoding="utf-8") as src:
        entries = json.load(src)

    stripped_texts = (item.get("text", "").strip() for item in entries)
    sections = [
        {
            "section_number": label,
            "section_title": ""  # filled in by a later step
        }
        for label in stripped_texts
        if any(ch.isdigit() for ch in label)
    ]

    # Save to JSON
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(sections, dst, ensure_ascii=False, indent=4)

    print(f"Extracted {len(sections)} section numbers → {output_file}")

# Run the step now; it reads filtered_sections.json and writes sections.json.
extract_sections_simple()

#This code is to save section numbers and section titles in separate key-value pairs
import json

input_file = "filtered_sections.json"
output_file = "sections_other.json"

def extract_sections():
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    sections = []
    current_section_number = None
    current_title_parts = []

    for entry in data:
        text = entry.get("text", "").strip()

        # Detect if text contains any digit → section number
        if any(char.isdigit() for char in text):
            # Save previous section if exists
            if current_section_number is not None:
                sections.append({
                    "section_number": current_section_number,
                    "section_title": " ".join(current_title_parts).strip()
                })
            # Start new section
            current_section_number = text
            current_title_parts = []
        else:
            # If we already detected a section_number, append to title
            if current_section_number is not None:
                current_title_parts.append(text)

    # Save the last section
    if current_section_number is not None:
        sections.append({
            "section_number": current_section_number,
            "section_title": " ".join(current_title_parts).strip()
        })

    # Save to JSON
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=4)

    print(f"Extracted {len(sections)} sections → {output_file}")

# Run the step now; it reads filtered_sections.json and writes sections_other.json.
extract_sections()

#This code is to remove full-stops and square brackets from section numbers
import json

input_file = "sections_other.json"
output_file = "sections_cleaned.json"

def clean_section_numbers():
    """Strip full stops and square brackets from every section number.

    Reads ``input_file``, removes ".", "[" and "]" from each "section_number"
    (a missing key becomes an empty string), trims surrounding whitespace and
    writes the records to ``output_file``.
    """
    with open(input_file, encoding="utf-8") as src:
        sections = json.load(src)

    # One translation table deletes all three characters in a single pass
    unwanted = str.maketrans("", "", ".[]")
    for record in sections:
        number = record.get("section_number", "")
        record["section_number"] = number.translate(unwanted).strip()

    # Save cleaned sections
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(sections, dst, ensure_ascii=False, indent=4)

    print(f"Cleaned section numbers saved → {output_file}")

# Run the step now; it reads sections_other.json and writes sections_cleaned.json.
clean_section_numbers()

#This code is to transfer any text in section numbers to its title
import json

input_file = "sections_cleaned.json"
output_file = "sections_final.json"

def move_text_from_section_number():
    """Move stray title text out of "section_number" into "section_title".

    Reads ``input_file``. For each record the leading run of digits/letters of
    the stripped section number is kept as the number; whatever follows is
    prepended to the stripped title. Records whose number has no trailing text
    keep their title untouched. The result is written to ``output_file``.
    """
    with open(input_file, encoding="utf-8") as src:
        sections = json.load(src)

    for record in sections:
        raw_number = record.get("section_number", "").strip()
        existing_title = record.get("section_title", "").strip()

        # Index of the first character that is neither a digit nor a letter;
        # the whole string counts as the number when there is none.
        cut = next(
            (pos for pos, ch in enumerate(raw_number)
             if not (ch.isdigit() or ch.isalpha())),
            len(raw_number),
        )

        record["section_number"] = raw_number[:cut]
        leftover = raw_number[cut:].strip()

        # Prepend the leftover text to the title, if there is any
        if leftover:
            if existing_title:
                record["section_title"] = leftover + " " + existing_title
            else:
                record["section_title"] = leftover

    # Save updated sections
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(sections, dst, ensure_ascii=False, indent=4)

    print(f"Updated sections saved → {output_file}")

# Run the step now; it reads sections_cleaned.json and writes sections_final.json.
move_text_from_section_number()

#This code checks if there are any empty section titles and so that we can manually fill them
import json

input_file = "sections_final.json"

def fill_empty_section_titles():
    """Prompt for every missing section title and save the file in place.

    Reads ``input_file``; for each record whose "section_title" is missing or
    blank, shows the section number and stores the stripped answer typed by
    the user. The records are written back to the same file.
    """
    # Load JSON
    with open(input_file, encoding="utf-8") as src:
        sections = json.load(src)

    # Records are examined lazily, one at a time, in file order
    untitled = (rec for rec in sections if not rec.get("section_title", "").strip())
    for rec in untitled:
        print(f"Section number {rec.get('section_number')} has an empty title.")
        rec["section_title"] = input("Please enter the section title: ").strip()

    # Save updated JSON over the input file
    with open(input_file, "w", encoding="utf-8") as dst:
        json.dump(sections, dst, ensure_ascii=False, indent=4)

    print(f"All empty section titles updated → {input_file}")

# Fill in the missing titles interactively, then reload the page dump and
# print it for inspection
fill_empty_section_titles()

import json
with open("penal_code_pages_30_59.json", encoding="utf-8") as pages_file:
    data = json.load(pages_file)

print(json.dumps(data, ensure_ascii=False, indent=4))

import json
import re

input_file = "penal_code_pages_30_59.json"
filtered_file = "penal_code_pages_filtered.json"
output_file = "sections_extracted.json"

# Step 1: Load JSON and remove all entries with style "all_caps"
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Filter out entries with "all_caps" in style
filtered_data = [entry for entry in data if "all_caps" not in entry.get("style", [])]

# Save filtered data (optional)
with open(filtered_file, "w", encoding="utf-8") as f:
    json.dump(filtered_data, f, indent=4, ensure_ascii=False)


def _is_header(entry):
    """Return True for a span with font size 12.0 whose style contains "bold"."""
    return entry.get("font_size") == 12.0 and "bold" in entry.get("style", [])


def _starts_body(entry):
    """Return True when a span can open a section body (no style, or italic)."""
    style = entry.get("style", [])
    return style == [] or "italic" in style


# Page footer such as "Page 30 of 179"; the total is matched as any number.
page_footer = re.compile(r"Page\s+\d+\s+of\s+\d+")

# Step 2: Extract sections from filtered data
sections = []
i = 0
n = len(filtered_data)

while i < n:
    if not _is_header(filtered_data[i]):
        i += 1
        continue

    # A header can be split over several consecutive header-style spans (for
    # example a section number followed by a title that wraps onto a second
    # line). Gather the whole run so no part of the header is dropped.
    header_parts = []
    j = i
    while j < n and _is_header(filtered_data[j]):
        part = filtered_data[j].get("text", "").strip()
        if part:
            header_parts.append(part)
        j += 1

    # A section is recorded only when the run is directly followed by body text
    if j < n and _starts_body(filtered_data[j]):
        body_parts = []

        # The body runs until the next header-style span
        while j < n and not _is_header(filtered_data[j]):
            # Skip very small font entries (<7)
            if filtered_data[j].get("font_size", 0) >= 7:
                body_parts.append(filtered_data[j].get("text", ""))
            j += 1

        # Remove the page footer from the collected body
        section_text = page_footer.sub("", "".join(body_parts))

        sections.append({
            "section_header": " ".join(header_parts),
            "section_text": section_text.strip()
        })

    # Continue after the header run (and after the body, when one was read)
    i = j

# Save final sections
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(sections, f, indent=4, ensure_ascii=False)

print("Done: entries with 'all_caps' removed and sections extracted.")

# Note: a header split over several consecutive bold 12pt spans (such as a
# title that continues on a second line) is joined into one section_header
# by the loop above.

import re
import json

text = """This Act shall be called the 3[Pakistan]
Penal Code, and shall take effect 4* * * throughout 5[Pakistan]."""

# Pattern explanation:
#   (\d+)          → captures the footnote number (3, 4, 5)
#   \[[^\]]*\]     → a bracketed insertion such as [Pakistan]
#   |              → OR
#   \*(?:\s*\*)*   → a run of asterisks such as * * *; the run always ends on
#                    an asterisk, so the whitespace after it is not captured
pattern = r"(\d+)(\[[^\]]*\]|\*(?:\s*\*)*)"

# Offsets index into `text` and cover only the annotated span (group 2),
# i.e. the bracketed text or the asterisks, not the number in front of it.
results = [
    {
        "placeholder": match.group(1),
        "start_offset": match.start(2),
        "end_offset": match.end(2)
    }
    for match in re.finditer(pattern, text)
]

# Save results
with open("offsets.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print("Offsets saved to offsets.json")

#This is the schema of Pakistan Penal Code
{
  "code_title": "Pakistan Penal Code",
  "code_sub_title": "Act No. XLV OF 1860",
  "Date": "6 Oct 1860",
  "Preamble": "Whereas ....",
  "chapters": {
      "chapter_number": 1,
      "chapter_roman": "I",
      "chapter_title": "INTRODUCTION",
      "sub_chapter_title": "Of Theft",
      "sections": []
  }
}

#This is the proposed schema of the sections
{
  "section_number": "1",

  "section_title": "Title and extent of operation of the Code.",
  "section_text": "This Act shall be called the [Pakistan] Penal Code, and shall take effect throughout [Pakistan].",

  "references": {
      "section_title": [],
      "section_text": ["30-3", "30-4", "30-5"]
  },

  "inline_map": [
    {
      "placeholder": "3",
      "footnote_id": "30-3",
      "location": "section_text",
      "start_offset": 28,
      "end_offset": 38
    },
    {
      "placeholder": "4",
      "footnote_id": "30-4",
      "location": "section_text",
      "start_offset": 52,
      "end_offset": 55
    },
    {
      "placeholder": "5",
      "footnote_id": "30-5",
      "location": "section_text",
      "start_offset": 78,
      "end_offset": 88
    }
  ]
}

#If the footnote applies to the entire section
{
  "placeholder": "10",
  "footnote_id": "30-10",
  "location": "section_title+section_text",
  "start_offset": 0,
  "end_offset": "FULL"
}

import fitz  # PyMuPDF
import re
import json

# Open the PDF; the path is relative to the current working directory.
doc = fitz.open("Pakistan Penal Code.pdf")

# Finished chapter records, in the order they are encountered.
extracted_chapters = []
# Record of the chapter currently being assembled; stays empty until the first
# "CHAPTER ..." heading is found.
current_chapter_data = {}
# Parser state: waiting either for a chapter number or for the title that
# follows a chapter number.
state = "LOOKING_FOR_CHAPTER_NUMBER"

def is_chapter_heading_style(span):
    """Return True when a span is 12.0pt, bold, upper-case and not blank.

    ``span`` must provide the keys "size", "font" and "text".
    """
    if span["size"] != 12.0:
        return False
    if "Bold" not in span["font"]:
        return False
    label = span["text"]
    return label.isupper() and bool(label.strip())

# "CHAPTER" followed by a Roman numeral and an optional single letter suffix.
# Compiled once here instead of on every span; matching stays case-insensitive.
chapter_heading = re.compile(r"CHAPTER\s+([IVXLCDM]+(?:[A-Z])?)", re.IGNORECASE)

# Start reading from page 30 (index 29)
for page_num in range(29, doc.page_count):
    page = doc.load_page(page_num)
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()

                if not text: # Skip empty spans
                    continue

                # Only heading-style spans can carry a chapter number or title
                if not is_chapter_heading_style(span):
                    continue

                chapter_match = chapter_heading.match(text)

                if chapter_match:
                    # A new chapter number closes the chapter in progress. Its
                    # record is stored as it is: the text of this line belongs
                    # to the new chapter and must not be used as the previous
                    # chapter's title.
                    if current_chapter_data:
                        extracted_chapters.append(current_chapter_data)

                    # Anything after the numeral on the same line is the title
                    remaining_text = text[chapter_match.end():].strip()

                    # Start new chapter entry
                    current_chapter_data = {
                        "chapter_number": len(extracted_chapters) + 1, # Sequential numbering
                        "chapter_roman": chapter_match.group(1).upper(),
                        "chapter_title": remaining_text
                    }

                    if remaining_text:
                        state = "LOOKING_FOR_CHAPTER_NUMBER" # Title found, ready for next chapter number
                    else:
                        state = "LOOKING_FOR_CHAPTER_TITLE" # Expect title in next span/line

                elif state == "LOOKING_FOR_CHAPTER_TITLE":
                    # First heading-style span after the chapter number is the title
                    current_chapter_data["chapter_title"] = text
                    state = "LOOKING_FOR_CHAPTER_NUMBER" # Title found, ready for next chapter number

# Append the last chapter if one was started; its title stays empty when no
# heading-style span followed the chapter number.
if current_chapter_data:
    extracted_chapters.append(current_chapter_data)

# Print the extracted chapters in JSON format
print(json.dumps(extracted_chapters, indent=2))

# Commented out IPython magic to ensure Python compatibility.
# #Install MinerU
# %%bash
# pip install --upgrade pip
# pip install uv
# uv pip install -U "mineru[core]"

# Make sure the input and output folders exist
import os

for folder in ("input", "output"):
    os.makedirs(folder, exist_ok=True)

# Let the user upload a PDF file through Colab
from google.colab import files

uploaded = files.upload()

for name in uploaded:
    print("Uploaded:", name)

# Move the first uploaded document into the input folder
import shutil

first_upload = list(uploaded)[0]
shutil.move(first_upload, "input/")

#Run MinerU parser
!mineru parse -p "input/Pakistan Penal Code.pdf" -o output/ --format json

# List the entries of the output folder.
# NOTE(review): the list is not printed or stored here; it is presumably only
# shown because this is the last expression of a notebook cell.
import os

os.listdir("output")