# bussines_case_automation/diagnose_excel_issue.py

#!/usr/bin/env python3
import os
import zipfile
import xml.etree.ElementTree as ET
import openpyxl
from openpyxl.xml.functions import fromstring, tostring
from pathlib import Path

def _report_openpyxl_load(file_path):
    """Print whether openpyxl can load the workbook, plus its sheet names."""
    print("\n1. Testing openpyxl compatibility:")
    wb = None
    try:
        wb = openpyxl.load_workbook(file_path, read_only=False, keep_vba=True, data_only=False)
        print("   ✓ Successfully loaded with openpyxl")
        print(f"   - Sheets: {wb.sheetnames}")

        # Only report custom properties when the workbook object exposes them.
        if hasattr(wb, 'custom_doc_props'):
            print(f"   - Custom properties: {wb.custom_doc_props}")
    except Exception as e:
        # Any failure here is itself the diagnostic result, so report it
        # and let the remaining checks run.
        print(f"   ✗ Failed to load with openpyxl: {e}")
    finally:
        # Close the workbook even when inspecting it failed part-way.
        if wb is not None:
            wb.close()


def _report_zip_structure(file_path):
    """Print custom XML parts, table parts and external references.

    Returns True when at least one archive member with "custom" in its name
    was found, False otherwise (including when the archive is unreadable).
    """
    print("\n2. Analyzing ZIP/XML structure:")
    has_custom_parts = False
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            names = zf.namelist()

            # A case-insensitive match on "custom" covers both the
            # customXml/ parts and docProps/custom.xml.
            custom_xml_files = [name for name in names if 'custom' in name.lower()]
            if custom_xml_files:
                has_custom_parts = True
                print(f"   ! Found custom XML files: {custom_xml_files}")

                for custom_file in custom_xml_files:
                    try:
                        content = zf.read(custom_file)
                        print(f"\n   Content of {custom_file}:")
                        # Show only the first 500 bytes to keep the report short.
                        print(f"   {content[:500].decode('utf-8', errors='ignore')}")
                    except Exception as e:
                        print(f"   Error reading {custom_file}: {e}")

            # Check for tables
            table_files = [name for name in names if 'xl/tables/' in name]
            if table_files:
                print(f"   - Found table files: {table_files}")
                for table_file in table_files:
                    # Check if XML declaration is present
                    if not zf.read(table_file).startswith(b'<?xml'):
                        print(f"   ! WARNING: {table_file} missing XML declaration")

            # Check workbook.xml for external references
            if 'xl/workbook.xml' in names:
                try:
                    root = ET.fromstring(zf.read('xl/workbook.xml'))
                    ext_refs = root.findall('.//{http://schemas.openxmlformats.org/spreadsheetml/2006/main}externalReference')
                    if ext_refs:
                        print(f"   ! Found {len(ext_refs)} external references")
                except Exception as e:
                    print(f"   ! Error parsing workbook.xml: {e}")

    except Exception as e:
        print(f"   ✗ Failed to analyze ZIP structure: {e}")
    return has_custom_parts


def _report_sharepoint_metadata(file_path):
    """Print SharePoint/OneDrive markers found in docProps/custom.xml.

    Returns a tuple ``(checked, has_content_type_id, has_media_service)``;
    ``checked`` is False when the archive could not be read.
    """
    print("\n3. Checking for SharePoint/OneDrive metadata:")
    has_content_type_id = False
    has_media_service = False
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            if 'docProps/custom.xml' in zf.namelist():
                content = zf.read('docProps/custom.xml')
                has_content_type_id = b'ContentTypeId' in content
                has_media_service = b'MediaService' in content
                if has_content_type_id:
                    print("   ! Found SharePoint ContentTypeId in custom.xml")
                    print("   ! This file contains SharePoint metadata that may cause issues")
                if has_media_service:
                    print("   ! Found MediaService tags in custom.xml")
    except Exception as e:
        print(f"   ✗ Error checking metadata: {e}")
        return False, False, False
    return True, has_content_type_id, has_media_service


def _report_template_diff(file_path):
    """Print archive members that differ between the template and the file."""
    print("\n4. Comparing with template:")
    # NOTE(review): this is not an f-string, so "{store_name}" is matched
    # literally - presumably the template file carries the placeholder in its
    # name; confirm against the template directory.
    template_path = Path(file_path).parent.parent / "template" / "Footprints AI for {store_name} - Retail Media Business Case Calculations.xlsx"
    if not template_path.exists():
        print(f"   - Template not found at {template_path}")
        return

    try:
        with zipfile.ZipFile(template_path, 'r') as tf, zipfile.ZipFile(file_path, 'r') as gf:
            template_files = set(tf.namelist())
            generated_files = set(gf.namelist())

        # Files in generated but not in template
        extra_files = generated_files - template_files
        if extra_files:
            print(f"   ! Extra files in generated: {extra_files}")

        # Files in template but not in generated
        missing_files = template_files - generated_files
        if missing_files:
            print(f"   ! Missing files in generated: {missing_files}")
    except Exception as e:
        print(f"   ✗ Error comparing with template: {e}")


def diagnose_excel_file(file_path):
    """Diagnose an Excel file for corruption issues and print a report.

    Runs four checks in order: loading with openpyxl, scanning the ZIP
    package for custom XML / table / external-reference parts, looking for
    SharePoint/OneDrive markers in ``docProps/custom.xml``, and comparing
    the package's member list with the template workbook. The closing
    summary reflects what the checks actually found.

    Args:
        file_path: Path of the ``.xlsx`` file to inspect.

    Returns:
        None. All results are written to standard output.
    """
    print(f"Diagnosing: {file_path}")
    print("=" * 50)

    # Nothing to diagnose when the file is missing.
    if not os.path.exists(file_path):
        print(f"ERROR: File not found: {file_path}")
        return

    _report_openpyxl_load(file_path)
    has_custom_parts = _report_zip_structure(file_path)
    checked, has_content_type_id, has_media_service = _report_sharepoint_metadata(file_path)
    _report_template_diff(file_path)

    print("\n" + "=" * 50)
    print("DIAGNOSIS SUMMARY:")
    if not checked:
        # The package was unreadable, so no conclusion about metadata holds.
        print("The file could not be inspected for SharePoint/OneDrive metadata;")
        print("see the errors reported above.")
    elif has_content_type_id or has_media_service:
        print("The error 'This file has custom XML elements that are no longer supported'")
        print("is likely caused by SharePoint/OneDrive metadata in the custom.xml file.")
        if has_content_type_id:
            print("\nThe ContentTypeId property suggests this file was previously stored in")
            print("SharePoint/OneDrive, which added custom metadata that Excel doesn't support")
            print("in certain contexts.")
    elif has_custom_parts:
        print("Custom XML parts were found (listed above), but no SharePoint/OneDrive")
        print("markers were detected; inspect the listed parts for the cause.")
    else:
        print("No custom XML parts or SharePoint/OneDrive metadata were detected.")

def find_workbooks(directory):
    """Return the ``.xlsx`` workbooks in *directory*, newest first.

    Excel lock files (names starting with ``~$``) are skipped: they match
    ``*.xlsx`` but are not workbook packages, and they are typically the most
    recently modified entry while a workbook is open.

    Args:
        directory: Directory to search (non-recursive).

    Returns:
        A list of ``Path`` objects sorted by modification time, most recent
        first; empty when no workbook is found.
    """
    candidates = [
        path
        for path in Path(directory).glob("*.xlsx")
        if not path.name.startswith("~$")
    ]
    return sorted(candidates, key=os.path.getmtime, reverse=True)


# Test with the latest file
if __name__ == "__main__":
    output_dir = Path(__file__).parent / "output"
    test_file = output_dir / "Footprints AI for Test14 - Retail Media Business Case Calculations 2025-2028.xlsx"

    if test_file.exists():
        diagnose_excel_file(str(test_file))
    else:
        print(f"Test file not found: {test_file}")
        # Fall back to the most recently modified workbook in the output directory.
        excel_files = find_workbooks(output_dir)
        if excel_files:
            print(f"\nFound {len(excel_files)} Excel files in output directory.")
            print("Diagnosing the most recent one...")
            diagnose_excel_file(str(excel_files[0]))
        else:
            # Say so explicitly instead of exiting silently.
            print(f"No Excel files found in {output_dir}")