import gzip
import io
import os
import re
from datetime import datetime, timezone
from pathlib import Path
# Absolute URL prefix prepended to every path and to each sitemap filename.
base_url = 'http://milahu.duckdns.org/'

# Limits from the sitemaps.org protocol: at most 50,000 URLs and 50 MB
# (uncompressed) per sitemap file.
max_urls_per_file = 50000
max_file_size = 50 * 1024 * 1024

# Input lists of URL paths, one path per line (".gz" files are decompressed).
input_file_list = [
    'cas.files.txt.gz',
]

# Output names: uncompressed sitemap index plus numbered gzipped sitemaps.
sitemap_index_file = 'sitemap.xml'
output_prefix = 'sitemap'
output_extension = '.xml.gz'

# Replacements applied to URL paths before embedding them in <loc> elements.
# BUG FIX: the five XML special characters previously mapped to themselves
# (e.g. '&' -> '&'), so escape_xml was a no-op and any path containing
# &, <, >, " or ' produced invalid XML.  They must become entity references.
# Whitespace is percent-encoded so the resulting URL stays valid.
escape_map = {
    '&': '&amp;',
    '"': '&quot;',
    '>': '&gt;',
    '<': '&lt;',
    "'": '&apos;',
    ' ': '%20',
    '\t': '%09',
    '\n': '%0a',
    '\r': '%0d',
}
def escape_xml(text):
    """Return *text* with every character listed in escape_map replaced.

    The replacement table lives in the module-level ``escape_map`` dict; all
    of its keys are combined into one alternation so the string is scanned
    in a single pass.
    """
    alternation = '|'.join(map(re.escape, escape_map))
    return re.sub(alternation, lambda m: escape_map[m.group(0)], text)
def get_current_datetime():
    """Return the current UTC time as a W3C datetime string.

    Example: ``2024-01-01T12:00:00+00:00`` — the format sitemap ``<lastmod>``
    elements expect.

    BUG FIX: the original used naive ``datetime.now()`` (local time) while
    hard-coding a ``+00:00`` offset, mislabeling local timestamps as UTC.
    An aware UTC timestamp makes the suffix truthful.
    """
    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S+00:00')
def read_input_file(filename):
    """Read *filename* and return its content as a list of lines.

    Files ending in ``.gz`` are transparently gunzipped; everything is
    decoded as UTF-8.  Trailing newlines are stripped by splitlines().
    """
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt', encoding='utf-8') as handle:
        return handle.read().splitlines()
def write_sitemap_header(f):
    """Write the XML declaration and the opening <urlset> tag to *f*."""
    f.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    )
def write_sitemap_footer(f):
    """Close the <urlset> element opened by write_sitemap_header."""
    print('</urlset>', file=f)
def write_sitemap_index_header(f):
    """Write the XML declaration and the opening <sitemapindex> tag to *f*."""
    f.writelines((
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
    ))
def write_sitemap_index_footer(f):
    """Close the <sitemapindex> element opened by write_sitemap_index_header."""
    print('</sitemapindex>', file=f)
def format_url_entry(path):
    """Return a single-line <url> element (newline-terminated) for *path*.

    The path is escaped via escape_xml and appended to the module-level
    base_url; changefreq is fixed at "never".
    """
    loc = f'{base_url}{escape_xml(path)}'
    return f'<url><loc>{loc}</loc><changefreq>never</changefreq></url>\n'
def write_sitemap_index_entry(f, sitemap_url):
    """Append one <sitemap> entry for *sitemap_url* to the index file *f*.

    lastmod is stamped with the current time via get_current_datetime().
    """
    entry = (
        '<sitemap>'
        f'<loc>{sitemap_url}</loc>'
        f'<lastmod>{get_current_datetime()}</lastmod>'
        '</sitemap>\n'
    )
    f.write(entry)
def save_compressed_sitemap(buffer, filename):
    """Persist *buffer*'s text to *filename* as gzip, then empty the buffer.

    The StringIO buffer is reused across sitemap chunks, so after writing it
    is rewound and truncated back to length zero.
    """
    content = buffer.getvalue()
    with gzip.open(filename, 'wt', encoding='utf-8') as out:
        out.write(content)
    # Reset the shared buffer so the caller can start the next sitemap.
    buffer.seek(0)
    buffer.truncate()
def main():
    """Build gzipped sitemap files from the input path lists, plus an index.

    Paths are read from ``input_file_list`` (missing/broken inputs are
    reported and skipped), formatted as <url> entries, and split across
    numbered sitemap files whenever either the URL-count limit or the
    uncompressed size limit would be exceeded.  Finally an uncompressed
    sitemap index referencing every generated file is written.
    """
    paths = []
    for input_file in input_file_list:
        try:
            paths += read_input_file(input_file)
        except FileNotFoundError:
            print(f"Error: Input file '{input_file}' not found")
            continue
        except Exception as e:
            # Name the offending file so failures in multi-file runs are
            # traceable (the original message omitted it).
            print(f"Error reading input file '{input_file}': {str(e)}")
            continue
    file_count = 1       # numeric suffix of the next sitemap file
    url_count = 0        # <url> entries in the current buffer
    sitemap_files = []   # generated filenames, later listed in the index
    # Each sitemap is accumulated uncompressed in memory, then gzipped on save.
    buffer = io.StringIO()
    write_sitemap_header(buffer)
    current_size = buffer.tell()  # uncompressed size of the pending sitemap
    for path in paths:
        url_entry = format_url_entry(path)
        # Roll over to a new file before either protocol limit is exceeded.
        if (url_count >= max_urls_per_file or current_size + len(url_entry) > max_file_size):
            write_sitemap_footer(buffer)
            output_file = f"{output_prefix}.{file_count}{output_extension}"
            save_compressed_sitemap(buffer, output_file)
            sitemap_files.append(output_file)
            print(f"writing {output_file}")
            file_count += 1
            url_count = 0
            write_sitemap_header(buffer)
            current_size = buffer.tell()
        buffer.write(url_entry)
        url_count += 1
        current_size = buffer.tell()
    # Flush the final, partially filled sitemap (skipped when no paths at all).
    if url_count > 0:
        write_sitemap_footer(buffer)
        output_file = f"{output_prefix}.{file_count}{output_extension}"
        save_compressed_sitemap(buffer, output_file)
        sitemap_files.append(output_file)
        print(f"writing {output_file}")
    print(f"writing {sitemap_index_file}")
    # The index itself stays uncompressed so crawlers can fetch it directly.
    with open(sitemap_index_file, 'w', encoding='utf-8') as index_file:
        write_sitemap_index_header(index_file)
        for sitemap_file in sitemap_files:
            sitemap_url = f"{base_url}{sitemap_file}"
            write_sitemap_index_entry(index_file, sitemap_url)
        write_sitemap_index_footer(index_file)


if __name__ == '__main__':
    main()