bussines_case_automation/fix_excel_corruption.py

#!/usr/bin/env python3
"""
Fix Excel corruption issues caused by SharePoint/OneDrive metadata
"""
import os
import shutil
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import tempfile
import openpyxl

def remove_sharepoint_metadata(excel_path, output_path=None):
    """
    Remove SharePoint/OneDrive metadata from Excel file that causes corruption warnings

    The workbook is first re-saved through openpyxl with its custom document
    properties cleared, then the saved package is copied entry by entry into a
    fresh ZIP in which docProps/custom.xml (when present) is replaced by an
    empty property list. The destination is only replaced once that copy has
    completed, and both intermediate files are removed whether the run
    succeeds or fails.

    Args:
        excel_path: Path to the Excel file to fix
        output_path: Optional path for the fixed file (if None, overwrites original)

    Returns:
        bool: True if successful, False otherwise
    """
    if not output_path:
        output_path = excel_path

    print(f"Processing: {excel_path}")

    # Tracked outside the try block so the finally clause can remove whatever
    # was created, no matter which step failed.
    temp_file = None
    tmp_path = None

    try:
        # Method 1: Use openpyxl to remove custom properties
        print("Method 1: Using openpyxl to clean custom properties...")

        # openpyxl saves any workbook loaded with keep_vba=True using a
        # macro-enabled content type, which Excel rejects for a .xlsx
        # extension. Only request it for genuinely macro-enabled formats.
        keep_vba = Path(excel_path).suffix.lower() in ('.xlsm', '.xltm')
        wb = openpyxl.load_workbook(excel_path, keep_vba=keep_vba)
        try:
            # Older openpyxl releases have no custom_doc_props attribute
            if hasattr(wb, 'custom_doc_props'):
                wb.custom_doc_props.props.clear()
                print("   ✓ Cleared custom document properties")

            # Save next to the destination first; the destination itself is
            # left untouched until the cleaned package is complete.
            temp_file = Path(output_path).with_suffix('.tmp.xlsx')
            wb.save(temp_file)
        finally:
            wb.close()

        # Method 2: Direct ZIP manipulation to ensure complete removal
        print("Method 2: Direct ZIP manipulation for complete cleanup...")
        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
            tmp_path = tmp.name

        with zipfile.ZipFile(temp_file, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', compression=zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename == 'docProps/custom.xml':
                    # Keep the part (it is referenced elsewhere in the
                    # package) but swap in an empty property list.
                    zout.writestr(item, create_clean_custom_xml())
                    print("   ✓ Replaced custom.xml with clean version")
                else:
                    # Copy the entry as-is
                    zout.writestr(item, zin.read(item.filename))

        # Replace the destination with the cleaned version
        shutil.move(tmp_path, output_path)

        print(f"   ✓ Successfully cleaned: {output_path}")
        return True

    except Exception as e:
        print(f"   ✗ Error cleaning file: {e}")
        return False

    finally:
        # Best-effort cleanup: a leftover .tmp.xlsx would otherwise be picked
        # up as a regular workbook by later "*.xlsx" globs.
        for leftover in (temp_file, tmp_path):
            if leftover is None:
                continue
            try:
                if os.path.exists(leftover):
                    os.remove(leftover)
            except OSError as cleanup_error:
                print(f"   ✗ Could not remove temporary file {leftover}: {cleanup_error}")

def create_clean_custom_xml():
    """
    Build the bytes of an empty docProps/custom.xml part

    Returns:
        bytes: UTF-8 encoded XML consisting of the XML declaration and a
        custom-properties root element that holds no property entries
    """
    namespace = "http://schemas.openxmlformats.org/officeDocument/2006/custom-properties"
    document_lines = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
        f'<Properties xmlns="{namespace}">',
        '</Properties>',
    )
    return "\n".join(document_lines).encode('utf-8')

def clean_template_file():
    """
    Clean the template file to prevent future corruption

    Every *.xlsx file in the "template" directory next to this script is
    backed up to <name>.backup.xlsx and then cleaned in place. Backup copies
    left by earlier runs are skipped so they are neither cleaned nor backed
    up again. If cleaning a template fails, it is restored from its backup.

    Returns:
        bool: False if there is no template to clean, True once all
        templates have been processed (including ones that failed and
        were restored)
    """
    template_dir = Path(__file__).parent / "template"

    # "*.xlsx" also matches "<name>.backup.xlsx"; without this filter each
    # run would create "<name>.backup.backup.xlsx" and so on.
    template_files = [
        candidate
        for candidate in template_dir.glob("*.xlsx")
        if not candidate.name.endswith('.backup.xlsx')
    ]

    if not template_files:
        print("No template files found")
        return False

    for template_file in template_files:
        print(f"\nCleaning template: {template_file.name}")

        # Create backup
        backup_path = template_file.with_suffix('.backup.xlsx')
        shutil.copy2(template_file, backup_path)
        print(f"   ✓ Created backup: {backup_path.name}")

        # Clean the template
        if remove_sharepoint_metadata(str(template_file)):
            print("   ✓ Template cleaned successfully")
        else:
            print("   ✗ Failed to clean template")
            # Restore from backup
            shutil.copy2(backup_path, template_file)
            print("   ✓ Restored from backup")

    return True

def clean_all_output_files():
    """
    Run the metadata cleaner over each *.xlsx file in the "output" directory

    The directory is resolved relative to this script. Each workbook is
    cleaned in place and a per-file success or failure line is printed.

    Returns:
        bool: False if the directory holds no *.xlsx files, True once every
        file found has been attempted (regardless of individual outcomes)
    """
    workbooks = list((Path(__file__).parent / "output").glob("*.xlsx"))

    if len(workbooks) == 0:
        print("No Excel files found in output directory")
        return False

    print(f"Found {len(workbooks)} Excel files to clean")

    for workbook_path in workbooks:
        print(f"\nCleaning: {workbook_path.name}")
        cleaned = remove_sharepoint_metadata(str(workbook_path))
        outcome = "   ✓ Cleaned successfully" if cleaned else "   ✗ Failed to clean"
        print(outcome)

    return True

def verify_file_is_clean(excel_path):
    """
    Check an Excel package for leftover SharePoint markers in docProps/custom.xml

    Args:
        excel_path: Path to the Excel file to inspect

    Returns:
        bool: True if the package has no docProps/custom.xml, or that part
        contains neither marker; False if a marker is found or the file
        cannot be read as a ZIP archive
    """
    print(f"\nVerifying: {excel_path}")

    custom_part = 'docProps/custom.xml'
    # (byte marker, message printed when it is found) - checked in this order
    forbidden_markers = (
        (b'ContentTypeId', "   ✗ Still contains SharePoint ContentTypeId"),
        (b'MediaService', "   ✗ Still contains MediaService tags"),
    )

    try:
        with zipfile.ZipFile(excel_path, 'r') as archive:
            if custom_part not in archive.namelist():
                print("   ✓ File is clean - no custom.xml present")
                return True

            payload = archive.read(custom_part)
            for marker, complaint in forbidden_markers:
                if marker in payload:
                    print(complaint)
                    return False

            print("   ✓ File is clean - no SharePoint metadata found")
            return True

    except Exception as e:
        print(f"   ✗ Error verifying file: {e}")
        return False

def main():
    """
    Run the complete workflow: clean templates, clean outputs, then verify

    The "template" and "output" directories are resolved relative to this
    script. Backup copies (*.backup.xlsx) in the template directory are
    excluded from verification.
    """
    banner = "=" * 60
    rule = "-" * 40
    base_dir = Path(__file__).parent

    print(banner)
    print("Excel SharePoint Metadata Cleaner")
    print(banner)

    # Step 1: templates
    print("\nStep 1: Cleaning template file...")
    print(rule)
    clean_template_file()

    # Step 2: generated workbooks
    print("\n\nStep 2: Cleaning output files...")
    print(rule)
    clean_all_output_files()

    # Step 3: re-inspect everything that was cleaned
    print("\n\nStep 3: Verifying cleaned files...")
    print(rule)

    for candidate in (base_dir / "template").glob("*.xlsx"):
        if candidate.name.endswith('.backup.xlsx'):
            continue
        verify_file_is_clean(str(candidate))

    for produced in (base_dir / "output").glob("*.xlsx"):
        verify_file_is_clean(str(produced))

    closing_lines = (
        "\n" + banner,
        "Cleaning complete!",
        "\nNOTE: The Excel files should now open without corruption warnings.",
        "The SharePoint/OneDrive metadata has been removed.",
        "\nFuture files generated from the cleaned template should not have this issue.",
        banner,
    )
    for line in closing_lines:
        print(line)

# Run the clean-and-verify workflow only when executed as a script, not on import
if __name__ == "__main__":
    main()