#!/usr/bin/env python3
"""
Utility to clean Excel files from SharePoint/OneDrive metadata
that causes cross-platform compatibility issues.
"""

import os
import sys
import openpyxl
from pathlib import Path
import tempfile
import shutil


def _strip_custom_metadata(workbook, verbose=False):
    """
    Remove custom document properties and custom XML from a workbook.

    Both attributes are probed with ``hasattr`` before being touched, so a
    workbook object that lacks either attribute is left alone.

    Args:
        workbook: Workbook object returned by ``openpyxl.load_workbook``
        verbose (bool): Print a confirmation line for each item cleared
    """
    if hasattr(workbook, 'custom_doc_props') and workbook.custom_doc_props:
        workbook.custom_doc_props.props.clear()
        if verbose:
            print(" ✓ Cleared custom document properties")

    if hasattr(workbook, 'custom_xml'):
        workbook.custom_xml = []
        if verbose:
            print(" ✓ Cleared custom XML")


def clean_excel_file(input_path, output_path=None):
    """
    Clean an Excel file from SharePoint/OneDrive metadata.

    The workbook is loaded, stripped of custom properties / custom XML,
    given neutral core properties, saved to a temporary file, re-loaded,
    stripped again and finally saved to ``output_path``. Progress and a
    size comparison are printed to stdout.

    Args:
        input_path (str): Path to the input Excel file
        output_path (str): Path for the cleaned file (optional). When
            omitted, the file is written next to the input with a
            ``_clean`` suffix added to the file name stem.

    Returns:
        bool: True if successful, False otherwise (missing input file or
        any exception raised while loading, cleaning or saving; the
        traceback is printed in the latter case)
    """
    if not os.path.exists(input_path):
        print(f"Error: File not found: {input_path}")
        return False

    if output_path is None:
        # Create cleaned version with _clean suffix
        path = Path(input_path)
        output_path = path.parent / f"{path.stem}_clean{path.suffix}"

    # Tracked outside the try block so the finally clause can release
    # whatever was actually acquired, even on a mid-way failure.
    tmp_path = None
    wb = None
    wb_clean = None

    try:
        print(f"Loading Excel file: {input_path}")

        # Load workbook without VBA to avoid macro issues
        wb = openpyxl.load_workbook(input_path, data_only=False, keep_vba=False)

        # Clean metadata
        print("Cleaning metadata...")
        _strip_custom_metadata(wb, verbose=True)

        # Clean core properties
        if wb.properties:
            # Keep only essential properties
            wb.properties.creator = "Excel Generator"
            wb.properties.lastModifiedBy = "Excel Generator"
            wb.properties.keywords = ""
            wb.properties.category = ""
            wb.properties.contentStatus = ""
            wb.properties.subject = ""
            wb.properties.description = ""
            print(" ✓ Cleaned core properties")

        # Create temporary file for double-save cleaning. delete=False so
        # the path survives the with-block; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as tmp:
            tmp_path = tmp.name

        print("Saving cleaned file...")

        # First save to temp file
        wb.save(tmp_path)
        wb.close()

        # Re-open and save again to ensure clean structure
        print("Re-processing for maximum cleanliness...")
        wb_clean = openpyxl.load_workbook(tmp_path, data_only=False)

        # Additional cleaning on the re-opened file (silent second pass)
        _strip_custom_metadata(wb_clean)

        # Save final clean version
        wb_clean.save(output_path)
        wb_clean.close()

        print(f"✓ Cleaned Excel file saved to: {output_path}")

        # Compare file sizes
        input_size = os.path.getsize(input_path)
        output_size = os.path.getsize(output_path)
        print(f"File size: {input_size:,} → {output_size:,} bytes")
        # The guard also guarantees input_size > 0 for the division below.
        if input_size > output_size:
            print(f"Reduced by {input_size - output_size:,} bytes ({((input_size - output_size) / input_size * 100):.1f}%)")

        return True

    except Exception as e:
        print(f"Error cleaning Excel file: {e}")
        import traceback
        traceback.print_exc()
        return False

    finally:
        # Close workbooks first so no handle is still open on the temp
        # file when it is removed.
        for book in (wb, wb_clean):
            if book is not None:
                try:
                    book.close()
                except Exception:
                    # Best-effort cleanup; never mask the real outcome.
                    pass

        # Clean up temporary file on success AND failure, so an error
        # during save/reload does not leak it in the temp directory.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # A leftover temp file must not turn a successful clean
                # into a failure.
                pass


def clean_template():
    """
    Clean the template file in the template directory.

    Looks in the ``template`` directory next to this script for the first
    existing file among the known template names and writes the cleaned
    copy to ``cleaned_template.xlsx`` in that same directory.

    Returns:
        bool: True if a template was found and cleaned, False otherwise
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    template_dir = os.path.join(script_dir, 'template')

    # Look for template files. The first name contains a literal
    # "{store_name}" placeholder (this is not an f-string).
    possible_templates = [
        'Footprints AI for {store_name} - Retail Media Business Case Calculations.xlsx',
        'Footprints AI for store_name - Retail Media Business Case Calculations.xlsx'
    ]

    template_path = None
    for template_name in possible_templates:
        full_path = os.path.join(template_dir, template_name)
        if os.path.exists(full_path):
            template_path = full_path
            print(f"Found template: {template_name}")
            break

    if not template_path:
        print(f"Error: No template found in {template_dir}")
        return False

    # Create cleaned template
    cleaned_path = os.path.join(template_dir, "cleaned_template.xlsx")
    return clean_excel_file(template_path, cleaned_path)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Clean specific file
        input_file = sys.argv[1]
        output_file = sys.argv[2] if len(sys.argv) > 2 else None

        if clean_excel_file(input_file, output_file):
            print("✓ File cleaned successfully")
        else:
            print("✗ Failed to clean file")
            sys.exit(1)
    else:
        # Clean template
        if clean_template():
            print("✓ Template cleaned successfully")
        else:
            print("✗ Failed to clean template")
            sys.exit(1)