#!/usr/bin/env python3
"""
Fix Excel corruption issues caused by SharePoint/OneDrive metadata.

The script rewrites workbooks so that the custom document properties part
(``docProps/custom.xml``) no longer carries SharePoint/OneDrive entries.
It processes every ``*.xlsx`` file found in the ``template`` and ``output``
directories next to this script, then verifies the result.
"""

import os
import shutil
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
import tempfile

import openpyxl

# Name of the package part that stores custom document properties.
_CUSTOM_PROPS_PART = 'docProps/custom.xml'

# Suffix given to template backups; such files must never be treated as
# templates themselves.
_BACKUP_SUFFIX = '.backup.xlsx'

# Extensions for which VBA content has to be preserved on save.
_MACRO_SUFFIXES = ('.xlsm', '.xltm')


def _remove_if_exists(path):
    """
    Best-effort removal of a temporary file.

    Args:
        path: Path (str or Path) of the file to delete, or None for a no-op.
    """
    if path is None:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        # Already gone (e.g. it was moved to its final location).
        pass
    except OSError as e:
        # Cleanup failure must not mask the real outcome of the operation.
        print(f" ! Could not remove temporary file {path}: {e}")


def remove_sharepoint_metadata(excel_path, output_path=None):
    """
    Remove SharePoint/OneDrive metadata from Excel file that causes corruption warnings

    The workbook is first re-saved through openpyxl with its custom document
    properties cleared, then the resulting archive is copied entry by entry,
    replacing ``docProps/custom.xml`` (if present) with an empty, valid part.

    Args:
        excel_path: Path to the Excel file to fix
        output_path: Optional path for the fixed file (if None, overwrites original)

    Returns:
        bool: True if successful, False otherwise
    """
    if not output_path:
        output_path = excel_path

    print(f"Processing: {excel_path}")

    temp_file = None  # intermediate workbook written by openpyxl
    tmp_path = None   # rebuilt archive, moved over output_path on success

    try:
        # Method 1: Use openpyxl to remove custom properties
        print("Method 1: Using openpyxl to clean custom properties...")

        # keep_vba makes openpyxl write the macro-enabled content type, so it
        # must only be enabled for macro-enabled files; otherwise the saved
        # .xlsx has a content type that does not match its extension.
        keep_vba = Path(excel_path).suffix.lower() in _MACRO_SUFFIXES
        temp_file = Path(output_path).with_suffix('.tmp.xlsx')

        wb = openpyxl.load_workbook(excel_path, keep_vba=keep_vba)
        try:
            # Older openpyxl releases have no custom_doc_props attribute.
            if hasattr(wb, 'custom_doc_props'):
                wb.custom_doc_props.props.clear()
                print(" ✓ Cleared custom document properties")

            # Save to temporary file first
            wb.save(temp_file)
        finally:
            wb.close()

        # Method 2: Direct ZIP manipulation to ensure complete removal
        print("Method 2: Direct ZIP manipulation for complete cleanup...")

        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
            tmp_path = tmp.name

        with zipfile.ZipFile(temp_file, 'r') as zin:
            with zipfile.ZipFile(tmp_path, 'w',
                                 compression=zipfile.ZIP_DEFLATED) as zout:
                for item in zin.infolist():
                    if item.filename == _CUSTOM_PROPS_PART:
                        # Keep the part (it is referenced by the package
                        # relationships) but drop all of its properties.
                        zout.writestr(item, create_clean_custom_xml())
                        print(" ✓ Replaced custom.xml with clean version")
                    else:
                        # Copy the file as-is
                        zout.writestr(item, zin.read(item.filename))

        # Replace original file with cleaned version
        shutil.move(tmp_path, output_path)
        tmp_path = None  # ownership transferred; nothing left to clean up

        print(f" ✓ Successfully cleaned: {output_path}")
        return True

    except Exception as e:
        print(f" ✗ Error cleaning file: {e}")
        return False

    finally:
        # Never leave intermediates behind: a stray *.tmp.xlsx next to the
        # output would be picked up by later "*.xlsx" globs.
        _remove_if_exists(temp_file)
        _remove_if_exists(tmp_path)


def create_clean_custom_xml():
    """
    Create a clean custom.xml without SharePoint metadata

    Returns:
        bytes: UTF-8 encoded, well-formed custom-properties document that
        contains no properties.
    """
    # A custom properties part must be a well-formed XML document whose root
    # is <Properties> in the custom-properties namespace; an empty root is
    # the minimal valid form.
    xml_content = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n'
        '<Properties '
        'xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties" '
        'xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes"/>'
    )
    return xml_content.encode('utf-8')


def clean_template_file():
    """
    Clean the template file to prevent future corruption

    Every ``*.xlsx`` file in the ``template`` directory next to this script
    is backed up to ``<name>.backup.xlsx`` and then cleaned in place. If
    cleaning fails, the file is restored from its backup.

    Returns:
        bool: False if no template files were found, True otherwise.
    """
    template_dir = Path(__file__).parent / "template"

    # Backups from a previous run also match "*.xlsx"; skip them so they are
    # neither cleaned nor backed up again.
    template_files = [
        candidate for candidate in template_dir.glob("*.xlsx")
        if not candidate.name.endswith(_BACKUP_SUFFIX)
    ]

    if not template_files:
        print("No template files found")
        return False

    for template_file in template_files:
        print(f"\nCleaning template: {template_file.name}")

        # Create backup
        backup_path = template_file.with_suffix(_BACKUP_SUFFIX)
        shutil.copy2(template_file, backup_path)
        print(f" ✓ Created backup: {backup_path.name}")

        # Clean the template
        if remove_sharepoint_metadata(str(template_file)):
            print(" ✓ Template cleaned successfully")
        else:
            print(" ✗ Failed to clean template")
            # Restore from backup
            shutil.copy2(backup_path, template_file)
            print(" ✓ Restored from backup")

    return True


def clean_all_output_files():
    """
    Clean all Excel files in the output directory

    Returns:
        bool: False if the ``output`` directory next to this script holds no
        ``*.xlsx`` files, True otherwise (individual failures are reported
        on stdout only).
    """
    output_dir = Path(__file__).parent / "output"
    # Materialise the listing before cleaning so files created during the
    # run are not processed.
    excel_files = list(output_dir.glob("*.xlsx"))

    if not excel_files:
        print("No Excel files found in output directory")
        return False

    print(f"Found {len(excel_files)} Excel files to clean")

    for excel_file in excel_files:
        print(f"\nCleaning: {excel_file.name}")
        if remove_sharepoint_metadata(str(excel_file)):
            print(" ✓ Cleaned successfully")
        else:
            print(" ✗ Failed to clean")

    return True


def verify_file_is_clean(excel_path):
    """
    Verify that an Excel file is free from SharePoint metadata

    Args:
        excel_path: Path to the Excel file to inspect.

    Returns:
        bool: True if the file has no ``docProps/custom.xml`` part or that
        part contains neither ``ContentTypeId`` nor ``MediaService``;
        False if either marker is found or the file cannot be read.
    """
    print(f"\nVerifying: {excel_path}")

    try:
        with zipfile.ZipFile(excel_path, 'r') as zf:
            if _CUSTOM_PROPS_PART not in zf.namelist():
                print(" ✓ File is clean - no custom.xml present")
                return True

            content = zf.read(_CUSTOM_PROPS_PART)

            # Check for problematic metadata
            if b'ContentTypeId' in content:
                print(" ✗ Still contains SharePoint ContentTypeId")
                return False

            if b'MediaService' in content:
                print(" ✗ Still contains MediaService tags")
                return False

            print(" ✓ File is clean - no SharePoint metadata found")
            return True

    except Exception as e:
        print(f" ✗ Error verifying file: {e}")
        return False


def main():
    """Main function to clean Excel files"""
    print("=" * 60)
    print("Excel SharePoint Metadata Cleaner")
    print("=" * 60)

    # Step 1: Clean the template
    print("\nStep 1: Cleaning template file...")
    print("-" * 40)
    clean_template_file()

    # Step 2: Clean all output files
    print("\n\nStep 2: Cleaning output files...")
    print("-" * 40)
    clean_all_output_files()

    # Step 3: Verify cleaning
    print("\n\nStep 3: Verifying cleaned files...")
    print("-" * 40)

    # Verify template (backups are intentionally left untouched)
    template_dir = Path(__file__).parent / "template"
    for template_file in template_dir.glob("*.xlsx"):
        if not template_file.name.endswith(_BACKUP_SUFFIX):
            verify_file_is_clean(str(template_file))

    # Verify output files
    output_dir = Path(__file__).parent / "output"
    for excel_file in output_dir.glob("*.xlsx"):
        verify_file_is_clean(str(excel_file))

    print("\n" + "=" * 60)
    print("Cleaning complete!")
    print("\nNOTE: The Excel files should now open without corruption warnings.")
    print("The SharePoint/OneDrive metadata has been removed.")
    print("\nFuture files generated from the cleaned template should not have this issue.")
    print("=" * 60)


if __name__ == "__main__":
    main()