from flask import Flask, request, jsonify
import pdfplumber
import fitz  # PyMuPDF
import base64
import os

import json
import re
import unicodedata


# Flask application object; the route decorators below register their handlers on it.
app = Flask(__name__)


@app.route("/")
def home():
    """Return a fixed plain-text banner for the root URL."""
    banner = "PDF Extractor API is running!"
    return banner



# POST route: accept a PDF upload, store it and return the parsed fields
@app.route('/upload-pdf', methods=['POST'])
def upload_pdf():
    """Store an uploaded PDF under ``uploads/`` and return its parsed content.

    Expects a multipart form with a ``file`` part whose name ends in ``.pdf``.

    Returns:
        A JSON response. On success it carries ``status``, ``message``, the
        filename as sent by the client, and ``data`` (the result of
        ``parse_nid_pdf``). On a validation failure it carries ``status`` and
        ``message`` with HTTP status 400.
    """
    # The form must contain a "file" part at all
    if 'file' not in request.files:
        return jsonify({
            "status": "error",
            "message": "No file part"
        }), 400

    uploaded = request.files['file']

    # An empty (or missing) filename means nothing was selected
    if not uploaded.filename:
        return jsonify({
            "status": "error",
            "message": "No selected file"
        }), 400

    # Only accept names with a .pdf extension
    if not uploaded.filename.lower().endswith('.pdf'):
        return jsonify({
            "status": "error",
            "message": "Only PDF file allowed"
        }), 400

    # The filename is client-controlled: keep only its last path component so
    # values such as "../../etc/x.pdf" or "..\\..\\x.pdf" cannot escape the
    # upload folder. Backslashes are normalised first because
    # os.path.basename does not treat them as separators on POSIX.
    safe_name = os.path.basename(uploaded.filename.replace('\\', '/'))

    # Create the uploads folder if it does not exist yet
    upload_folder = "uploads"
    os.makedirs(upload_folder, exist_ok=True)

    file_path = os.path.join(upload_folder, safe_name)
    uploaded.save(file_path)
    result = parse_nid_pdf(file_path)

    return jsonify({
        "status": "success",
        "message": "pdf আপলোড হয়েছে",
        "filename": uploaded.filename,
        "data": result
    })













def sanitize_bengali(text: str) -> str:
    """Remove duplicated Bengali combining marks from *text*.

    Three rules are applied in one left-to-right pass:

    1. A visarga (U+0983) is kept once; any run of further visargas and/or
       vowel signs directly after it is discarded.
    2. A vowel sign (U+09BE..U+09CD, which includes the virama) is discarded
       when the previously kept character is also in that range.
    3. One of the marks U+0981, U+0982, U+0983, U+09BC, U+09D7 is discarded
       when the previously kept character is the same mark.

    Every other character is copied through unchanged.
    """
    visarga = '\u0983'
    # Dependent vowel signs through the virama
    vowel_signs = {chr(code) for code in range(0x09BE, 0x09CD + 1)}
    # Candrabindu, anusvara, visarga, nukta, au length mark
    marks = {'\u0981', '\u0982', visarga, '\u09BC', '\u09D7'}

    kept = []
    swallowing = False  # True while inside the run that follows a kept visarga
    for char in text:
        if swallowing:
            if char == visarga or char in vowel_signs:
                continue
            swallowing = False

        if char == visarga:
            kept.append(char)
            swallowing = True
            continue

        previous = kept[-1] if kept else None
        if char in vowel_signs and previous in vowel_signs:
            continue
        if char in marks and previous == char:
            continue

        kept.append(char)

    return ''.join(kept)


def clean_text(text):
    """Normalise *text* for field extraction.

    Falsy input yields ``""``. Otherwise the text is NFC-normalised, every
    whitespace run is collapsed to a single space, ``sanitize_bengali`` is
    applied, and leading/trailing whitespace is stripped.
    """
    if not text:
        return ""

    # NFC first so that decomposed vowel-sign pairs are composed where possible
    composed = unicodedata.normalize('NFC', text)
    single_spaced = re.sub(r'\s+', ' ', composed)

    # Bengali-specific cleanup of duplicated marks
    return sanitize_bengali(single_spaced).strip()
















def extract_field(text, pattern, group=1):
    """Return the cleaned capture *group* of the first *pattern* match in *text*.

    The search is case-insensitive and ``.`` matches newlines. ``""`` is
    returned when the pattern does not match or the group is empty/unmatched.
    """
    found = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if not found:
        return ""

    captured = found.group(group)
    if not captured:
        return ""

    return clean_text(captured)

def extract_bounded(text, start_pattern, end_pattern):
    """Return the cleaned text found between *start_pattern* and *end_pattern*.

    Both delimiters are regular-expression fragments; matching is
    case-insensitive and ``.`` matches newlines. The shortest span between
    the first usable start and the following end is returned, passed through
    ``clean_text``. ``""`` is returned when no such span exists.

    Each delimiter is wrapped in a non-capturing group and the body is read
    through a named group, so that:

    * capturing groups inside *start_pattern* (e.g. ``(1st )?Spouse Name``)
      do not shift which group holds the body, and
    * a top-level ``|`` inside a delimiter (e.g. ``TIN|Driving``) stays
      local to that delimiter instead of splitting the whole expression.
    """
    pattern = re.compile(
        r'(?:' + start_pattern + r')(?P<_bounded_body>.*?)(?:' + end_pattern + r')',
        re.DOTALL | re.IGNORECASE,
    )
    match = pattern.search(text)
    return clean_text(match.group('_bounded_body')) if match else ""



# Address sub-fields as (output key, regex fragment for the label text).
# Consumed by parse_addr_block, which looks for each label case-insensitively
# on word boundaries and uses the key as the name of the resulting entry.
ADDR_LABELS = [
    ("division", r"Division"),
    ("district", r"District"),
    ("rmo", r"RMO"),
    ("city_corporation_or_municipality", r"City\s+Corporation\s+Or\s+Municipality"),
    ("upozila", r"Upozila"),
    ("union_ward", r"Union/Ward"),
    ("mouza_moholla", r"Mouza/Moholla"),
    ("additional_mouza_moholla", r"Additional\s+Mouza/Moholla"),
    ("ward_for_union_porishod", r"Ward\s+For\s+Union\s+Porishod"),
    ("village_road", r"Village/Road"),
    ("additional_village_road", r"Additional\s+Village/Road"),
    ("home_holding_no", r"Home/Holding\s+No"),
    ("post_office", r"Post\s+Office"),
    ("postal_code", r"Postal\s+Code"),
    ("region", r"Region"),
]

# A value that begins with one of these label names is treated as "empty"
LABEL_STARTERS = [
    "Division",
    "District",
    "RMO",
    "City Corporation Or Municipality",
    "Upozila",
    "Union/Ward",
    "Mouza/Moholla",
    "Additional Mouza/Moholla",
    "Ward For Union Porishod",
    "Village/Road",
    "Additional Village/Road",
    "Home/Holding No",
    "Post Office",
    "Postal Code",
    "Region",
]


def _starts_with_any_label(s: str) -> bool:
    """Return True if *s*, stripped, begins with any entry of LABEL_STARTERS.

    The comparison ignores case. Blank or whitespace-only input gives False.
    """
    candidate = s.strip()
    if candidate == "":
        return False

    # str.startswith accepts a tuple of alternatives
    prefixes = tuple(label.lower() for label in LABEL_STARTERS)
    return candidate.lower().startswith(prefixes)

def parse_addr_block(block):
    """Split one address block into its labelled sub-fields.

    The block is cleaned with ``clean_text`` and scanned for the labels in
    ``ADDR_LABELS``. The value of a label is the text between the end of its
    first occurrence and the start of the next label found.

    Args:
        block: Raw text of a single address section (may be empty or None).

    Returns:
        ``{}`` when *block* is falsy or contains no known label. Otherwise a
        dict with one entry per key in ``ADDR_LABELS``; labels that were not
        found, or whose value is unusable, map to ``""``.
    """
    if not block:
        return {}

    text = clean_text(block)

    # Scan with ONE alternation instead of one search per label. Matches from
    # finditer never overlap, so a short label such as "Mouza/Moholla" can no
    # longer be "found" inside "Additional Mouza/Moholla" and take its value.
    label_re = re.compile(
        r"\b(?:" + "|".join(f"(?P<{key}>{pat})" for key, pat in ADDR_LABELS) + r")\b",
        flags=re.IGNORECASE,
    )

    # Keep only the first occurrence of each label; finditer yields matches
    # left to right, so the list is already ordered by position.
    hits = []
    seen = set()
    for m in label_re.finditer(text):
        key = m.lastgroup
        if key in seen:
            continue
        seen.add(key)
        hits.append((m.start(), m.end(), key))

    if not hits:
        return {}

    addr = {}
    for i, (_start, end, key) in enumerate(hits):
        next_start = hits[i + 1][0] if i + 1 < len(hits) else len(text)
        val = clean_text(text[end:next_start].strip())

        # Blank the value when it is empty or when it begins with a label name
        if not val or _starts_with_any_label(val):
            val = ""

        # ward_for_union_porishod is only kept when it is purely numeric
        if key == "ward_for_union_porishod" and val and not re.fullmatch(r"\d+", val):
            val = ""

        addr[key] = val

    # Labels that were not present in the block default to an empty string
    for key, _ in ADDR_LABELS:
        addr.setdefault(key, "")

    return addr




























def parse_nid_pdf(pdf_path):
    """Extract the known fields from the PDF at *pdf_path*.

    The text of every page is read with PyMuPDF, concatenated, normalised
    with ``clean_text`` and then searched with label-based regular
    expressions.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict of extracted values. Scalar fields are strings (``""`` when
        not found), ``name`` is a dict with ``bangla``/``english`` entries,
        the three ``*_address`` entries are the dicts produced by
        ``parse_addr_block``, and ``voter_documents`` is a list of the known
        document names present in the text.
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = [page.get_text("text") for page in doc]
    finally:
        # Release the file handle even if text extraction fails
        doc.close()

    full_text = clean_text("\n".join(page_texts))

    data = {}

    # Basic info
    data["national_id"] = extract_field(full_text, r'National ID\s+([0-9]+)')
    data["pin"] = extract_field(full_text, r'Pin\s+([0-9]+)')
    data["status"] = extract_field(full_text, r'Status\s+(\w+)')
    data["afis_status"] = extract_field(full_text, r'Afis Status\s+([\w_]+)')
    data["lock_flag"] = extract_field(full_text, r'Lock Flag\s+(\w)')
    data["voter_no"] = extract_field(full_text, r'Voter No\s+([0-9]+)')
    data["form_no"] = extract_field(full_text, r'Form No\s+([\w]+)')
    data["sl_no"] = extract_field(full_text, r'Sl No\s+([0-9]+)')
    data["tag"] = extract_field(full_text, r'Tag\s+([\w_]+)')

    # Name
    data["name"] = {
        "bangla": extract_field(full_text, r'Name\(Bangla\)\s+(.*?)\s+Name\(English\)', 1),
        "english": extract_field(full_text, r'Name\(English\)\s+(.*?)\s+Date of Birth', 1)
    }

    data["date_of_birth"] = extract_field(full_text, r'Date of Birth\s+([0-9-]+)')
    data["birth_place"] = extract_field(full_text, r'Birth Place\s+(.*?)\s+Birth Other')
    data["birth_registration_no"] = extract_field(full_text, r'Birth Registration No\s+([0-9]*)')

    data["father_name"] = extract_field(full_text, r'Father Name\s+(.*?)\s+Mother Name')
    data["mother_name"] = extract_field(full_text, r'Mother Name\s+(.*?)\s+(1st )?Spouse Name')

    # Spouse name: the optional "1st " prefix must be NON-capturing, otherwise
    # it becomes group 1 and the prefix is returned instead of the name.
    spouse_block = extract_bounded(full_text, r'(?:1st )?Spouse Name', r'Gender')
    data["spouse_name"] = clean_text(spouse_block)

    data["gender"] = extract_field(full_text, r'Gender\s+(\w+)')
    data["marital_status"] = extract_field(full_text, r'Marital\s+(\w+)')
    data["occupation"] = extract_field(full_text, r'Occupation\s+(.*?)\s+Disability')

    # Addresses: each block runs from its heading to the next section heading
    present_block = extract_bounded(full_text, r'Present Address', r'(Permanent Address|Foreign Address|Education|Voter Documents)')
    permanent_block = extract_bounded(full_text, r'Permanent Address', r'(Foreign Address|Education|Voter Documents|Email)')
    foreign_block = extract_bounded(full_text, r'Foreign Address', r'(Education|Voter Documents|Email)')

    data["present_address"] = parse_addr_block(present_block)
    data["permanent_address"] = parse_addr_block(permanent_block)
    data["foreign_address"] = parse_addr_block(foreign_block or "")

    # Blood group: the end alternatives are grouped so the "|" cannot split
    # the whole expression into "Blood Group…TIN" OR "Driving" OR ….
    blood_block = extract_bounded(full_text, r'Blood Group', r'(?:TIN|Driving|Passport|Laptop ID)')
    blood = clean_text(blood_block)
    # Accept only an ABO type with an optional Rh sign; anything else
    # (stray "+", neighbouring text, …) is treated as "not present".
    if re.fullmatch(r'(?:AB|A|B|O)[+-]?', blood):
        data["blood_group"] = blood
    else:
        data["blood_group"] = ""

    data["education"] = extract_field(full_text, r'Education\s+(.*?)\s+(Education Other|Education Sub|Identification)')
    data["laptop_id"] = extract_field(full_text, r'Laptop ID\s+([\w_/]+)')
    data["nid_father"] = extract_field(full_text, r'NID Father\s+([0-9]*)')
    data["religion"] = extract_field(full_text, r'Religion\s+(\w+)')
    data["voter_area"] = extract_field(full_text, r'Voter Area\s+(.*?)\s+Voter At')
    data["voter_at"] = extract_field(full_text, r'Voter At\s+(\w+)')

    # Voter documents: report each known document name that occurs in the text
    known_docs = ["DATA ENTRY PROOF COPY", "OTHER", "VOTER FORM PAGE ONE", "VOTER FORM PAGE TWO"]
    data["voter_documents"] = [d for d in known_docs if d in full_text]

    # Fields that are not extracted; always emitted as empty strings
    empty_fields = ["disability", "disability_other", "tin", "driving", "passport", "phone", "mobile", "email", "nid_mother", "nid_spouse"]
    for f in empty_fields:
        data[f] = ""

    data["no_finger"] = extract_field(full_text, r'No Finger\s+([0-9])')
    data["no_finger_print"] = extract_field(full_text, r'No Finger Print\s+([0-9])')

    return data





# Expose the same Flask object under the name ``application`` as well
# (presumably for WSGI hosts that look up that name — confirm against the deployment).
application = app




