Files
bussines_case_automation/fix_excel_corruption.py
andrei 0e2e1bddba Add xlsxwriter-based Excel generation scripts with openpyxl implementation
- Created create_excel_xlsxwriter.py and update_excel_xlsxwriter.py
- Uses openpyxl exclusively to preserve Excel formatting and formulas
- Updated server.js to use new xlsxwriter scripts for form submissions
- Maintains all original functionality while ensuring proper Excel file handling

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-22 13:53:06 +00:00

207 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Fix Excel corruption issues caused by SharePoint/OneDrive metadata
"""
import os
import shutil
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import tempfile
import openpyxl
def remove_sharepoint_metadata(excel_path, output_path=None):
    """
    Remove SharePoint/OneDrive metadata from an Excel file that causes
    corruption warnings.

    The cleanup runs in two passes:
      1. The workbook is loaded with openpyxl, its custom document
         properties are cleared, and it is saved to a ``.tmp.xlsx`` staging
         file next to ``output_path``.
      2. The staging file is re-packed entry by entry into a second
         temporary archive; ``docProps/custom.xml`` (if present) is replaced
         by the output of ``create_clean_custom_xml()``. The result is then
         moved onto ``output_path``.

    Both temporary files are removed on every exit path, including failures,
    so no ``*.tmp.xlsx`` leftovers end up in the output/template directories.

    Args:
        excel_path: Path to the Excel file to fix
        output_path: Optional path for the fixed file (if None, overwrites original)

    Returns:
        bool: True if successful, False otherwise (the error is printed,
        not raised)
    """
    if not output_path:
        output_path = excel_path
    print(f"Processing: {excel_path}")

    staging_file = None   # openpyxl output, lives next to output_path
    repacked_path = None  # re-zipped archive in the system temp directory
    try:
        # Method 1: Use openpyxl to remove custom properties
        print("Method 1: Using openpyxl to clean custom properties...")
        staging_file = Path(output_path).with_suffix('.tmp.xlsx')

        # Only request VBA preservation for macro-enabled workbooks. With
        # keep_vba=True openpyxl attaches a VBA archive and then writes the
        # macro-enabled content type, which Excel rejects inside a plain
        # .xlsx file -- the very symptom this script is meant to cure.
        keep_vba = Path(excel_path).suffix.lower() == '.xlsm'
        wb = openpyxl.load_workbook(excel_path, keep_vba=keep_vba)
        try:
            # Remove custom document properties
            if hasattr(wb, 'custom_doc_props'):
                wb.custom_doc_props.props.clear()
                print(" ✓ Cleared custom document properties")
            # Save to the staging file first so the input is never
            # half-written if saving fails.
            wb.save(staging_file)
        finally:
            wb.close()

        # Method 2: Direct ZIP manipulation to ensure complete removal
        print("Method 2: Direct ZIP manipulation for complete cleanup...")
        # delete=False: only the name is needed here; the file is reopened
        # by zipfile below and removed explicitly in the finally clause.
        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
            repacked_path = tmp.name

        with zipfile.ZipFile(staging_file, 'r') as zin:
            with zipfile.ZipFile(repacked_path, 'w', compression=zipfile.ZIP_DEFLATED) as zout:
                # Copy all files except custom.xml, which gets a clean body
                for item in zin.infolist():
                    if item.filename == 'docProps/custom.xml':
                        zout.writestr(item, create_clean_custom_xml())
                        print(" ✓ Replaced custom.xml with clean version")
                    else:
                        zout.writestr(item, zin.read(item.filename))

        # Replace original file with cleaned version
        shutil.move(repacked_path, output_path)
        repacked_path = None  # ownership transferred; nothing left to delete
        print(f" ✓ Successfully cleaned: {output_path}")
        return True
    except Exception as e:
        print(f" ✗ Error cleaning file: {e}")
        return False
    finally:
        # Best-effort cleanup of both temporary files on success and failure.
        for leftover in (staging_file, repacked_path):
            if leftover is None:
                continue
            try:
                os.remove(leftover)
            except OSError:
                pass
def create_clean_custom_xml():
    """
    Build the replacement body for docProps/custom.xml.

    Returns:
        bytes: UTF-8 encoded XML consisting of the XML declaration and an
        empty ``Properties`` element in the custom-properties namespace.
    """
    # Minimal document: declaration plus an empty root element, one per line.
    document_lines = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties">',
        '</Properties>',
    )
    return "\n".join(document_lines).encode('utf-8')
def clean_template_file():
    """
    Clean the template file(s) to prevent future corruption.

    Every ``*.xlsx`` file in the ``template`` directory next to this script
    is backed up to ``<name>.backup.xlsx`` and then cleaned in place with
    ``remove_sharepoint_metadata``. If cleaning fails, the template is
    restored from its backup.

    Files that are themselves backups (``*.backup.xlsx``) are skipped, so
    re-running the script neither rewrites earlier backups nor creates
    ``.backup.backup.xlsx`` copies.

    Returns:
        bool: False if there is no template to clean, True otherwise
        (individual failures are reported on stdout, not via the return
        value).
    """
    template_dir = Path(__file__).parent / "template"
    # Materialise the list up front: backups created in the loop below live
    # in the same directory and must not be picked up by the glob.
    template_files = [
        candidate
        for candidate in template_dir.glob("*.xlsx")
        if not candidate.name.endswith('.backup.xlsx')
    ]
    if not template_files:
        print("No template files found")
        return False

    for template_file in template_files:
        print(f"\nCleaning template: {template_file.name}")

        # Create backup
        backup_path = template_file.with_suffix('.backup.xlsx')
        shutil.copy2(template_file, backup_path)
        print(f" ✓ Created backup: {backup_path.name}")

        # Clean the template
        if remove_sharepoint_metadata(str(template_file)):
            print(" ✓ Template cleaned successfully")
        else:
            print(" ✗ Failed to clean template")
            # Restore from backup
            shutil.copy2(backup_path, template_file)
            print(" ✓ Restored from backup")
    return True
def clean_all_output_files():
    """
    Run the metadata cleaner over every workbook in the output directory.

    The ``output`` directory is resolved relative to this script. Each
    ``*.xlsx`` file found there is passed to ``remove_sharepoint_metadata``
    and the per-file outcome is printed.

    Returns:
        bool: False when the directory holds no ``*.xlsx`` files, True once
        all discovered files have been processed.
    """
    workbooks = list((Path(__file__).parent / "output").glob("*.xlsx"))
    if len(workbooks) == 0:
        print("No Excel files found in output directory")
        return False

    print(f"Found {len(workbooks)} Excel files to clean")
    for workbook_path in workbooks:
        print(f"\nCleaning: {workbook_path.name}")
        succeeded = remove_sharepoint_metadata(str(workbook_path))
        print(" ✓ Cleaned successfully" if succeeded else " ✗ Failed to clean")
    return True
def verify_file_is_clean(excel_path):
    """
    Check an Excel file for leftover SharePoint metadata.

    The file is opened as a ZIP archive and ``docProps/custom.xml`` is
    scanned for the byte markers ``ContentTypeId`` and ``MediaService``.

    Args:
        excel_path: Path to the Excel file to inspect

    Returns:
        bool: True when the archive has no custom.xml or the part contains
        neither marker; False when a marker is found or the file cannot be
        read (the reason is printed).
    """
    print(f"\nVerifying: {excel_path}")
    custom_part = 'docProps/custom.xml'
    # Checked in this order; the first marker found decides the message.
    marker_messages = (
        (b'ContentTypeId', " ✗ Still contains SharePoint ContentTypeId"),
        (b'MediaService', " ✗ Still contains MediaService tags"),
    )
    try:
        with zipfile.ZipFile(excel_path, 'r') as archive:
            if custom_part not in archive.namelist():
                print(" ✓ File is clean - no custom.xml present")
                return True
            payload = archive.read(custom_part)

        for marker, message in marker_messages:
            if marker in payload:
                print(message)
                return False

        print(" ✓ File is clean - no SharePoint metadata found")
        return True
    except Exception as err:
        print(f" ✗ Error verifying file: {err}")
        return False
def main():
    """Clean the template and output workbooks, then verify the results."""
    heavy_rule = "=" * 60
    light_rule = "-" * 40
    script_dir = Path(__file__).parent

    print(heavy_rule)
    print("Excel SharePoint Metadata Cleaner")
    print(heavy_rule)

    # Step 1: templates first, so later generated files start from a clean base
    print("\nStep 1: Cleaning template file...")
    print(light_rule)
    clean_template_file()

    # Step 2: already-generated workbooks
    print("\n\nStep 2: Cleaning output files...")
    print(light_rule)
    clean_all_output_files()

    # Step 3: re-open everything and look for leftover markers
    print("\n\nStep 3: Verifying cleaned files...")
    print(light_rule)

    for template_path in (script_dir / "template").glob("*.xlsx"):
        # Backups still hold the original metadata by design; skip them.
        if template_path.name.endswith('.backup.xlsx'):
            continue
        verify_file_is_clean(str(template_path))

    for output_path in (script_dir / "output").glob("*.xlsx"):
        verify_file_is_clean(str(output_path))

    print("\n" + heavy_rule)
    print("Cleaning complete!")
    print("\nNOTE: The Excel files should now open without corruption warnings.")
    print("The SharePoint/OneDrive metadata has been removed.")
    print("\nFuture files generated from the cleaned template should not have this issue.")
    print(heavy_rule)
# Run the cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()