Document parser upload

2026-05-20 18:59:16 +05:30 · 2026-05-20 18:59:16 +05:30 · 4db7e5ade2
parent f2788e025d
commit 4db7e5ade2
8 changed files with 831 additions and 251 deletions
--- a/addons_extensions/document_parser/init.py
+++ b/addons_extensions/document_parser/init.py
@ -0,0 +1 @@
 from . import models
--- a/addons_extensions/document_parser/manifest.py
+++ b/addons_extensions/document_parser/manifest.py
@ -0,0 +1,19 @@
 {
    # Odoo module manifest for the Document Parser addon.
    "name": "Document Parser",
    "summary": "Reusable AI-assisted document text and data extraction",
    "version": "1.0.0",
    "category": "Tools",
    "author": "Pranay",
    "website": "https://www.ftprotech.com",
    "license": "LGPL-3",
    "depends": ["base"],
    # Only the settings view is loaded as data; it inherits
    # base.res_config_settings_view_form.
    "data": [
        "views/res_config_settings_views.xml",
    ],
    "installable": True,
    "application": False,
    "auto_install": False,
    # requests is imported unconditionally by the models. The OCR / PDF / DOCX
    # libraries are imported inside try/except blocks and are therefore not
    # declared as hard dependencies here.
    "external_dependencies": {
        "python": ["requests"],
    },
 }
--- a/addons_extensions/document_parser/models/init.py
+++ b/addons_extensions/document_parser/models/init.py
@ -0,0 +1,2 @@
 from . import document_parser_service
 from . import res_config_settings
--- a/addons_extensions/document_parser/models/document_parser_service.py
+++ b/addons_extensions/document_parser/models/document_parser_service.py
@ -0,0 +1,444 @@
 import base64
 import json
 import logging
 import mimetypes
 import re
 from io import BytesIO
 import requests
 from odoo import _, api, models
 from odoo.exceptions import UserError
 try:
    import pytesseract
 except Exception:  # pragma: no cover - optional dependency
    pytesseract = None
 try:
    from PIL import Image
 except Exception:  # pragma: no cover - optional dependency
    Image = None
 try:
    from pdf2image import convert_from_bytes
 except Exception:  # pragma: no cover - optional dependency
    convert_from_bytes = None
 try:
    from pypdf import PdfReader
 except Exception:  # pragma: no cover - optional dependency
    PdfReader = None
 try:
    from docx import Document
 except Exception:  # pragma: no cover - optional dependency
    Document = None
 _logger = logging.getLogger(__name__)
 class DocumentParserService(models.AbstractModel):
    _name = "document.parser.service"
    _description = "Document Parser Service"
    TOGETHER_ENDPOINT = "https://api.together.xyz/v1/chat/completions"
    OPENROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
    TOGETHER_MODELS = [
        "Qwen/Qwen2.5-7B-Instruct-Turbo",
        "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    ]
    OPENROUTER_MODELS = [
        "qwen/qwen-2.5-7b-instruct",
        "qwen/qwen-2.5-7b-instruct:free",
        "deepseek/deepseek-chat:free",
    ]
    @api.model
    def parse_document(
        self,
        file_content,
        filename=None,
        required_fields=None,
        extra_instructions=None,
        json_schema=None,
    ):
        """Extract text from a document and map it onto the requested fields.

        The document is decoded, its mimetype detected and its text extracted.
        The text (truncated to 45000 characters) is sent to the AI providers;
        when they return nothing, regex heuristics are used instead.

        :param file_content: raw bytes or base64 (bytes or str) of the document
        :param filename: name of the uploaded file (mandatory despite the default)
        :param required_fields: dict or list describing the fields to extract
        :param extra_instructions: free text appended to the AI prompt
        :param json_schema: schema text overriding the one built from the fields
        :return: dict with keys ``filename``, ``mimetype``, ``text``,
            ``result``, ``provider``, ``errors`` and ``error``
        :raises UserError: when the content or the filename is missing
        """
        if not file_content:
            raise UserError(_("No document provided."))
        if not filename:
            raise UserError(_("Filename is required."))

        raw_bytes = self._decode_file_content(file_content)
        detected_type = self._detect_mimetype(raw_bytes, filename)
        document_text = self._extract_text(raw_bytes, detected_type)
        normalized_fields = self._normalize_required_fields(required_fields or {})

        report = {"filename": filename, "mimetype": detected_type}

        # Nothing to analyse: report the failure without calling any provider.
        if not document_text.strip():
            report.update(
                text="",
                result={},
                provider=False,
                errors=[_("No text could be extracted from the document.")],
                error=_("No text could be extracted from the document."),
            )
            return report

        schema = json_schema or self._build_json_schema_text(normalized_fields)
        extracted, provider_name, failures = self._send_to_ai(
            text_content=document_text[:45000],
            schema_text=schema,
            extra_instructions=extra_instructions,
        )
        if not extracted:
            # Every provider failed or was unconfigured: fall back to regexes.
            extracted = self._extract_with_heuristics(document_text, normalized_fields)
        extracted = extracted or {}

        # Only surface a summary error when nothing at all could be extracted.
        if not extracted and failures:
            summary = "; ".join(failures[:3])
        else:
            summary = False

        report.update(
            text=document_text,
            result=extracted,
            provider=provider_name,
            errors=failures,
            error=summary,
        )
        return report
    @api.model
    def extract_requested_data(self, file_content, filename, required_fields, extra_instructions=None, json_schema=None):
        """Run :meth:`parse_document` and return only its ``result`` entry.

        All arguments are forwarded unchanged, as keywords.
        """
        outcome = self.parse_document(
            file_content=file_content,
            filename=filename,
            required_fields=required_fields,
            extra_instructions=extra_instructions,
            json_schema=json_schema,
        )
        return outcome["result"]
    def _decode_file_content(self, file_content):
        if isinstance(file_content, bytes):
            if file_content.startswith((b"%PDF", b"\xFF\xD8", b"\x89PNG", b"PK")):
                return file_content
            try:
                return base64.b64decode(file_content)
            except Exception:
                return file_content
        if isinstance(file_content, str):
            try:
                return base64.b64decode(file_content)
            except Exception as exc:
                raise UserError(_("Invalid base64 document.")) from exc
        raise UserError(_("Unsupported file format."))
    def _detect_mimetype(self, binary, filename):
        if filename:
            guessed = mimetypes.guess_type(filename)[0]
            if guessed:
                return guessed
        if binary.startswith(b"%PDF"):
            return "application/pdf"
        if binary.startswith(b"\xFF\xD8"):
            return "image/jpeg"
        if binary.startswith(b"\x89PNG"):
            return "image/png"
        if binary[:2] == b"PK":
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        return "application/octet-stream"
    def _extract_text(self, binary, mimetype):
        text_content = ""
        try:
            if mimetype == "application/pdf":
                text_content = self._extract_text_from_pdf(binary)
            elif mimetype in {"image/png", "image/jpeg", "image/jpg"}:
                text_content = self._extract_text_from_image(binary)
            elif mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                text_content = self._extract_text_from_docx(binary)
            elif mimetype.startswith("text/"):
                text_content = binary.decode("utf-8", errors="ignore")
        except Exception as exc:
            _logger.exception("Document text extraction failed: %s", exc)
        return (text_content or "").strip()
    def _extract_text_from_pdf(self, binary):
        """Return the text of a PDF, or ``""`` when none can be obtained.

        The embedded text layer is read with ``PdfReader`` when available.
        If that yields nothing and both ``convert_from_bytes`` and
        ``pytesseract`` are available, the pages are rendered at 300 dpi and
        OCR'd. Failures of either step are logged as warnings.
        """
        page_texts = []
        if PdfReader:
            try:
                for page in PdfReader(BytesIO(binary)).pages:
                    page_texts.append(page.extract_text() or "")
            except Exception as exc:
                _logger.warning("PdfReader extraction failed: %s", exc)

        embedded_text = "\n".join(filter(None, page_texts)).strip()
        if embedded_text:
            return embedded_text

        # No text layer: OCR is the only remaining option.
        if not (convert_from_bytes and pytesseract):
            return ""
        try:
            rendered_pages = convert_from_bytes(binary, dpi=300)
            ocr_texts = [pytesseract.image_to_string(rendered) for rendered in rendered_pages]
            return "\n".join(ocr_texts).strip()
        except Exception as exc:
            _logger.warning("PDF OCR extraction failed: %s", exc)
        return ""
    def _extract_text_from_image(self, binary):
        """OCR an image and return its stripped text.

        Returns ``""`` when ``pytesseract`` or ``PIL.Image`` is unavailable,
        or when OCR fails (the failure is logged as a warning).
        """
        if not (pytesseract and Image):
            return ""
        try:
            picture = Image.open(BytesIO(binary))
            recognised = pytesseract.image_to_string(picture)
            return recognised.strip()
        except Exception as exc:
            _logger.warning("Image OCR extraction failed: %s", exc)
            return ""
    def _extract_text_from_docx(self, binary):
        """Return the non-empty paragraphs of a DOCX joined by newlines.

        Returns ``""`` when ``docx.Document`` is unavailable or when parsing
        fails (the failure is logged as a warning).
        """
        if not Document:
            return ""
        try:
            parsed = Document(BytesIO(binary))
            lines = [block.text for block in parsed.paragraphs if block.text]
            return "\n".join(lines).strip()
        except Exception as exc:
            _logger.warning("DOCX extraction failed: %s", exc)
            return ""
    def _send_to_ai(self, text_content, schema_text, extra_instructions=None):
        """Ask the configured AI providers to extract structured data.

        Together is tried first, then OpenRouter. A provider without a
        configured key is skipped and reported in the error list.

        :return: tuple ``(result, provider, errors)`` where ``provider`` is
            ``"together"``, ``"openrouter"`` or ``False``, and ``errors``
            holds the messages collected from the providers tried before the
            one that succeeded (or from all of them on failure)
        """
        prompt = self._build_prompt(text_content, schema_text, extra_instructions)
        failures = []
        together_key = self._get_param("document_parser.together_ai_key") or self._get_param("document_parser.together_api_key")
        openrouter_key = self._get_param("document_parser.openrouter_ai_key") or self._get_param("document_parser.openrouter_api_key")

        # Primary provider.
        if not together_key:
            failures.append(_("Together AI key is not configured."))
        else:
            together_headers = {
                "Authorization": f"Bearer {together_key}",
                "Content-Type": "application/json",
            }
            answer, together_failures = self._call_provider(
                provider_name="Together",
                endpoint=self.TOGETHER_ENDPOINT,
                headers=together_headers,
                models=self.TOGETHER_MODELS,
                prompt=prompt,
            )
            if answer:
                return answer, "together", failures
            failures.extend(together_failures)

        # Fallback provider.
        if not openrouter_key:
            failures.append(_("OpenRouter key is not configured."))
        else:
            openrouter_headers = {
                "Authorization": f"Bearer {openrouter_key}",
                "Content-Type": "application/json",
                "HTTP-Referer": self._get_param("web.base.url") or "odoo.local",
                "X-Title": "Document Parser",
            }
            answer, openrouter_failures = self._call_provider(
                provider_name="OpenRouter",
                endpoint=self.OPENROUTER_ENDPOINT,
                headers=openrouter_headers,
                models=self.OPENROUTER_MODELS,
                prompt=prompt,
            )
            if answer:
                return answer, "openrouter", failures
            failures.extend(openrouter_failures)

        return {}, False, failures
    def _build_prompt(self, text_content, schema_text, extra_instructions=None):
        """Build the prompt sent as the single user message to the AI model.

        The prompt is a fixed block of output and field rules followed by the
        schema text, the extra instructions (the literal ``None`` when no
        instructions are given) and the document text.

        :param text_content: document text to analyse
        :param schema_text: textual description of the expected JSON
        :param extra_instructions: optional caller-specific instructions
        :return: the prompt string
        """
        return f"""
 You are a strict JSON generator.
 RULES:
 - Output ONLY valid raw JSON.
 - No explanation.
 - No markdown.
 - No backticks.
 - No extra text.
 - Follow schema strictly.
 - If a field is missing in text, return null.
 - Scan the entire document carefully before answering.
 - Extract ONLY what exists in text.
 - FOR ANY DATES CHANGE FORMAT TO %Y-%m-%d
 FIELD RULES:
 - If "skills" exists, extract only explicit technical skills written in the document.
 - Do NOT infer similar skills from role names, responsibilities, or projects.
 - Normalize names like "Expert Python" to "Python".
 - Exclude soft skills and business phrases.
 - Exclude responsibility-style phrases like Cross-Functional Collaboration, Cost Saving, Resource Utilization, Documentation, Reporting, and Team Handling.
 - Prefer concrete tools, methods, technologies, platforms, certifications, engineering/process methods, and domain techniques explicitly written in the resume.
 - If the resume explicitly mentions items like AutoCAD, Root Cause Analysis, Project Management, Manufacturing Processes, Lean, Six Sigma, or Quality Control, include them.
 - Remove duplicates and return each skill only once.
 - If "email" exists, return one valid normalized email.
 - If "name" exists, prefer the full name at the top and exclude titles, companies, and addresses.
 - If "phone" exists, return the most complete phone number found.
 - If "experience" exists, return only clearly supported numeric values.
 Schema:
 {schema_text}
 Instructions:
 {extra_instructions or "None"}
 Document:
 {text_content}
 """
    def _call_provider(self, provider_name, endpoint, headers, models, prompt):
        """Try each model of one provider until one returns usable JSON.

        Every model is called with temperature 0, at most 1500 tokens and a
        90 second timeout. HTTP errors, unparsable answers and exceptions are
        logged as warnings and recorded; the next model is then tried.

        :return: tuple ``(parsed_json, errors)``; ``parsed_json`` is ``{}``
            when every model failed
        """
        failures = []

        def record(problem):
            # Keep the server log and the list returned to the caller in sync.
            _logger.warning(problem)
            failures.append(problem)

        for model_name in models:
            request_body = {
                "model": model_name,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "max_tokens": 1500,
            }
            label = {"provider": provider_name, "model": model_name}
            try:
                reply = requests.post(endpoint, headers=headers, json=request_body, timeout=90)
                if reply.status_code == 200:
                    answer_text = self._extract_message_content(reply.json())
                    parsed = self._safe_json_load(answer_text)
                    if parsed:
                        return parsed, failures
                    problem = _("%(provider)s model %(model)s returned invalid JSON.") % label
                else:
                    problem = _("%(provider)s model %(model)s failed with %(status)s: %(body)s") % dict(
                        label,
                        status=reply.status_code,
                        body=(reply.text or "")[:300],
                    )
                record(problem)
            except Exception as exc:
                record(_("%(provider)s model %(model)s error: %(error)s") % dict(label, error=str(exc)))
        return {}, failures
    def _extract_message_content(self, response_body):
        try:
            content = response_body["choices"][0]["message"]["content"]
        except Exception:
            return ""
        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, dict):
                    if item.get("type") == "text":
                        parts.append(item.get("text", ""))
                    elif item.get("text"):
                        parts.append(item.get("text"))
                else:
                    parts.append(str(item))
            return "\n".join(part for part in parts if part)
        if isinstance(content, dict):
            return content.get("text", "")
        return content or ""
    def _safe_json_load(self, content):
        if not content:
            return {}
        content = content.strip().replace("```json", "").replace("```", "").strip()
        try:
            return json.loads(content)
        except Exception:
            pass
        match = re.search(r"\{[\s\S]*\}", content)
        if match:
            try:
                return json.loads(match.group(0))
            except Exception:
                pass
        _logger.warning("JSON parse failed for provider response: %s", content[:500])
        return {}
    def _extract_with_heuristics(self, text_content, fields):
        result = {}
        email_match = re.search(r"([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})", text_content or "", re.I)
        phone_match = re.search(r"(\+?\d[\d\-\s()]{7,}\d)", text_content or "")
        linkedin_match = re.search(r"(https?://(?:www\.)?linkedin\.com/[^\s]+)", text_content or "", re.I)
        name_guess = self._guess_name(text_content or "")
        skills_guess = self._guess_skills(text_content or "")
        for field_name, field_spec in fields.items():
            field_type = field_spec.get("type", "string")
            if field_name in {"email", "email_from"}:
                result[field_name] = email_match.group(1).lower() if email_match else None
            elif field_name in {"phone", "mobile", "partner_phone"}:
                result[field_name] = phone_match.group(1).strip() if phone_match else None
            elif field_name in {"linkedin_profile", "linkedin"}:
                result[field_name] = linkedin_match.group(1).strip() if linkedin_match else None
            elif field_name in {"name", "full_name", "partner_name"}:
                result[field_name] = name_guess
            elif field_name == "skills" and field_type == "list":
                result[field_name] = skills_guess
            else:
                result[field_name] = None
        return result
    def _guess_name(self, text_content):
        for line in [line.strip() for line in (text_content or "").splitlines() if line.strip()][:12]:
            cleaned = re.sub(r"[^A-Za-z .'-]", "", line).strip()
            if len(cleaned.split()) in {2, 3, 4} and not re.search(r"(resume|cv|email|phone|linkedin|skills|experience)", cleaned, re.I):
                return cleaned
        return None
    def _guess_skills(self, text_content):
        section = re.search(r"(skills|technical skills|core competencies)(.*?)(experience|education|projects|certifications|$)", text_content or "", re.I | re.S)
        if not section:
            return []
        parts = re.split(r"[,;\n|•]", section.group(2))
        cleaned = []
        for part in parts:
            value = re.sub(r"\s+", " ", part).strip(" -:\t\r\n")
            if value and 1 < len(value) < 50 and not re.search(r"^(skills?|experience|education)$", value, re.I):
                cleaned.append(value)
        return list(dict.fromkeys(cleaned[:25]))
    def _get_param(self, key):
        """Read a system parameter by key, with superuser rights."""
        parameters = self.env["ir.config_parameter"].sudo()
        return parameters.get_param(key)
    def _normalize_required_fields(self, fields):
        if isinstance(fields, dict):
            normalized = {}
            for field_name, field_value in fields.items():
                if isinstance(field_value, dict):
                    normalized[field_name] = {
                        "type": field_value.get("type", "string"),
                        "description": field_value.get("description", field_name.replace("_", " ").title()),
                    }
                else:
                    normalized[field_name] = {
                        "type": "string",
                        "description": str(field_value or field_name.replace("_", " ").title()),
                    }
            return normalized
        if isinstance(fields, list):
            return {field_name: {"type": "string", "description": field_name.replace("_", " ").title()} for field_name in fields}
        return {}
    def _build_json_schema_text(self, fields):
        return json.dumps(fields, ensure_ascii=True)
--- a/addons_extensions/document_parser/models/res_config_settings.py
+++ b/addons_extensions/document_parser/models/res_config_settings.py
@ -0,0 +1,67 @@
 import requests
 from odoo import _, fields, models
 from odoo.exceptions import UserError
 class ResConfigSettings(models.TransientModel):
    """Settings for the Document Parser AI providers.

    Stores the Together and OpenRouter API keys as system parameters and
    offers a "Test Connection" action for each provider.
    """
    _inherit = "res.config.settings"
    together_ai_key = fields.Char(
        string="Together AI Key",
        config_parameter="document_parser.together_ai_key",
    )
    openrouter_ai_key = fields.Char(
        string="OpenRouter AI Key",
        config_parameter="document_parser.openrouter_ai_key",
    )
    def _document_parser_check_connection(self, url, headers, title, failure_template):
        """Call a provider endpoint and report the outcome to the user.

        :param url: endpoint queried with GET (20 second timeout)
        :param headers: HTTP headers carrying the credentials
        :param title: translated title of the success notification
        :param failure_template: translated message with one ``%s`` slot
        :return: client action displaying a success notification
        :raises UserError: when the request cannot be performed (timeout,
            DNS failure, refused connection, ...) or the provider answers
            with a non-2xx/3xx status
        """
        try:
            response = requests.get(url, headers=headers, timeout=20)
        except requests.RequestException as exc:
            # Network failures must reach the user as a readable message,
            # not as a server traceback.
            raise UserError(failure_template % str(exc)) from exc
        if not response.ok:
            raise UserError(failure_template % (response.text or response.reason))
        return {
            "type": "ir.actions.client",
            "tag": "display_notification",
            "params": {
                "title": title,
                "message": _("Connection successful."),
                "type": "success",
                "sticky": False,
            },
        }
    def action_test_together_ai_connection(self):
        """Check the Together AI key entered in the form against the API."""
        self.ensure_one()
        if not self.together_ai_key:
            raise UserError(_("Please add the Together AI key first."))
        return self._document_parser_check_connection(
            url="https://api.together.xyz/v1/models",
            headers={"Authorization": f"Bearer {self.together_ai_key}"},
            title=_("Together AI Connection"),
            failure_template=_("Together AI connection failed: %s"),
        )
    def action_test_openrouter_ai_connection(self):
        """Check the OpenRouter key entered in the form against the API."""
        self.ensure_one()
        if not self.openrouter_ai_key:
            raise UserError(_("Please add the OpenRouter key first."))
        return self._document_parser_check_connection(
            url="https://openrouter.ai/api/v1/models",
            headers={
                "Authorization": f"Bearer {self.openrouter_ai_key}",
                "HTTP-Referer": self.env["ir.config_parameter"].sudo().get_param("web.base.url", ""),
                "X-Title": "Odoo Document Parser",
            },
            title=_("OpenRouter Connection"),
            failure_template=_("OpenRouter connection failed: %s"),
        )
--- a/addons_extensions/document_parser/views/res_config_settings_views.xml
+++ b/addons_extensions/document_parser/views/res_config_settings_views.xml
@ -0,0 +1,39 @@
 <?xml version="1.0" encoding="utf-8"?>
 <odoo>
    <!-- Adds a "Document Parser" app section to the general settings form,
         visible to base.group_system only. It exposes the Together AI and
         OpenRouter API keys as password fields, each with a "Test Connection"
         button bound to the matching action of res.config.settings. -->
    <record id="res_config_settings_view_form_document_parser" model="ir.ui.view">
        <field name="name">res.config.settings.view.form.document.parser</field>
        <field name="model">res.config.settings</field>
        <field name="priority" eval="80"/>
        <field name="inherit_id" ref="base.res_config_settings_view_form"/>
        <field name="arch" type="xml">
            <xpath expr="//form" position="inside">
                <app string="Document Parser" name="document_parser" groups="base.group_system">
                    <block title="AI Providers" name="document_parser_ai_provider_block">
                        <setting string="Together AI Key"
                                 help="Primary provider used first for structured document extraction."
                                 id="document_parser_together_ai_key">
                            <div class="d-flex align-items-center gap-2">
                                <field name="together_ai_key" password="True" placeholder="together.ai API key"/>
                                <button name="action_test_together_ai_connection"
                                        string="Test Connection"
                                        type="object"
                                        class="btn btn-secondary"/>
                            </div>
                        </setting>
                        <setting string="OpenRouter AI Key"
                                 help="Fallback provider used when Together AI is unavailable or quota is exhausted."
                                 id="document_parser_openrouter_ai_key">
                            <div class="d-flex align-items-center gap-2">
                                <field name="openrouter_ai_key" password="True" placeholder="openrouter API key"/>
                                <button name="action_test_openrouter_ai_connection"
                                        string="Test Connection"
                                        type="object"
                                        class="btn btn-secondary"/>
                            </div>
                        </setting>
                    </block>
                </app>
            </xpath>
        </field>
    </record>
 </odoo>
--- a/addons_extensions/employee_it_declaration/models/it_tax_statement_wiz.py
+++ b/addons_extensions/employee_it_declaration/models/it_tax_statement_wiz.py
@ -653,8 +653,8 @@ class ITTaxStatementWizard(models.TransientModel):
                # self.professional_tax +
                # self.nps_employer_contribution
        )
-        taxable_old = max(0.0, annual_gross_salary + self.other_income + hp_income - old_deductions)
+        taxable_old = max(0.0, annual_gross_salary + self.other_income + hp_income - old_deductions - (-(self.professional_tax)))
-        taxable_new = max(0.0, annual_gross_salary + self.other_income + hp_income - new_deductions)
+        taxable_new = max(0.0, annual_gross_salary + self.other_income + hp_income - new_deductions - (-(self.professional_tax)))
        tax_result_old = self._compute_tax_old_regime(taxable_old, old_slab) if old_slab else False
        tax_result_new = self._compute_tax_new_regime(taxable_new, new_slab) if new_slab else False
        comparison_available = bool(tax_result_old and tax_result_new)
@ -750,6 +750,7 @@ class ITTaxStatementWizard(models.TransientModel):
        gross_salary_projected = values['gross_salary_projected']
        annual_net_salary = values['annual_net_salary']
        selected_standard_deduction = values['selected_standard_deduction']
        total_sec_16_deduction = values['selected_standard_deduction']+(-(self.professional_tax))
        old_deductions = values['old_deductions']
        new_deductions = values['new_deductions']
        hp_income = values['hp_income']
@ -845,6 +846,7 @@ class ITTaxStatementWizard(models.TransientModel):
            'deductions': {
                'professional_tax': self.professional_tax,
                'standard_deduction': selected_standard_deduction,
                'total_sec_16_deduction': total_sec_16_deduction,
                'nps_employer': self.nps_employer_contribution,
                'hra_exemption': self.hra_exemption,
                'interest_home_loan': self.interest_home_loan_self + self.interest_home_loan_letout,
@ -861,7 +863,7 @@ class ITTaxStatementWizard(models.TransientModel):
                'gross_salary': annual_gross_salary,
                'other_income': self.other_income,
                'house_property_income': hp_income,
-                'gross_total_income': (annual_gross_salary + self.other_income + hp_income) - selected_standard_deduction,
+                'gross_total_income': (annual_gross_salary + self.other_income + hp_income) - total_sec_16_deduction,
            },
            'taxable_income': {
--- a/addons_extensions/employee_it_declaration/report/it_tax_template.xml
+++ b/addons_extensions/employee_it_declaration/report/it_tax_template.xml
@ -30,6 +30,7 @@
                <t t-set="tax_employment" t-value="'{:,.0f}'.format(deductions.get('professional_tax', 0))"/>
                <t t-set="standard_deduction" t-value="'{:,.0f}'.format(deductions.get('standard_deduction', 0))"/>
                <t t-set="other_income" t-value="'{:,.0f}'.format(income_details.get('other_income', 0))"/>
                <t t-set="total_sec_16_deduction" t-value="'{:,.0f}'.format(deductions.get('total_sec_16_deduction', 0))"/>
                <t t-set="gross_total_income" t-value="'{:,.0f}'.format(income_details.get('gross_total_income', 0))"/>
                <t t-set="taxable_income" t-value="'{:,.0f}'.format(tax_computation.get('taxable_income', 0))"/>
                <t t-set="roundoff_taxable_income" t-value="'{:,.0f}'.format(tax_computation.get('roundoff_taxable_income', 0))"/>
@ -198,17 +199,22 @@
                        </tr>
                        <tr style="border-top: 2px solid #ddd; margin-bottom: 8px; margin-top: 10px;">
                            <td style="padding-left: 30px;">Less: Standard Deduction for Salaried Employees</td>
                            <td style="text-align: right;">
                                -<span t-esc="standard_deduction"/>
                            </td>
                            <td colspan="2"></td>
                            <td style="text-align: right;"
                                t-esc="standard_deduction"/>
                        </tr>
                        <tr style="border-top: 2px solid #ddd; margin-bottom: 8px; margin-top: 10px;">
                            <td style="padding-left: 30px;">Total</td>
                            <td colspan="2"></td>
                            <td style="text-align: right;"
-                                t-esc="tax_employment"/>
+                                t-esc="total_sec_16_deduction"/>
-                            <td colspan="1"></td>
+                        </tr>
-                            <td style="text-align: right;"
+                        <tr style="border-top: 2px solid #ddd; margin-bottom: 8px; margin-top: 10px;">
-                                t-esc="standard_deduction"/>
+                            <td><strong>Less:</strong> Marginal Relief</td>
                            <td colspan="3"></td>
                            <td style="text-align: right;">0</td>
                        </tr>
                        <tr style="border-top: 2px solid #ddd; font-weight: bold; margin-bottom: 8px; margin-top: 10px;">
		`@ -0,0 +1,2 @@`
							`from . import document_parser_service`
							`from . import res_config_settings`