import gzip
import io
import os
import re
from datetime import datetime, timezone
from pathlib import Path
# Absolute URL prefix prepended to every path and to each sitemap filename.
base_url = 'http://milahu.duckdns.org/'

# Limits from the sitemaps.org protocol: at most 50,000 URLs and 50 MB
# (uncompressed) per sitemap file.
max_urls_per_file = 50000
max_file_size = 50 * 1024 * 1024

# Input lists of URL paths, one path per line (".gz" files are decompressed).
input_file_list = [
    'cas.files.txt.gz',
]

# Output names: uncompressed sitemap index plus numbered gzipped sitemaps.
sitemap_index_file = 'sitemap.xml'
output_prefix = 'sitemap'
output_extension = '.xml.gz'

# Replacements applied to URL paths before embedding them in <loc> elements.
# BUG FIX: the five XML special characters previously mapped to themselves
# (e.g. '&' -> '&'), so escape_xml was a no-op and any path containing
# &, <, >, " or ' produced invalid XML.  They must become entity references.
# Whitespace is percent-encoded so the resulting URL stays valid.
escape_map = {
    '&': '&amp;',
    '"': '&quot;',
    '>': '&gt;',
    '<': '&lt;',
    "'": '&apos;',
    ' ': '%20',
    '\t': '%09',
    '\n': '%0a',
    '\r': '%0d',
}
def escape_xml(text):
    """Return *text* with every character listed in escape_map replaced.

    The replacement table lives in the module-level ``escape_map`` dict; all
    of its keys are combined into one alternation so the string is scanned
    in a single pass.
    """
    alternation = '|'.join(map(re.escape, escape_map))
    return re.sub(alternation, lambda m: escape_map[m.group(0)], text)
def get_current_datetime():
    """Return the current UTC time as a W3C datetime string.

    Example: ``2024-01-01T12:00:00+00:00`` — the format sitemap ``<lastmod>``
    elements expect.

    BUG FIX: the original used naive ``datetime.now()`` (local time) while
    hard-coding a ``+00:00`` offset, mislabeling local timestamps as UTC.
    An aware UTC timestamp makes the suffix truthful.
    """
    return datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S+00:00')
def read_input_file(filename):
    """Read *filename* and return its content as a list of lines.

    Files ending in ``.gz`` are transparently gunzipped; everything is
    decoded as UTF-8.  Trailing newlines are stripped by splitlines().
    """
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt', encoding='utf-8') as handle:
        return handle.read().splitlines()
def write_sitemap_header(f):
    """Write the XML declaration and the opening <urlset> tag to *f*."""
    f.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    )
def write_sitemap_footer(f):
    """Close the <urlset> element opened by write_sitemap_header."""
    print('</urlset>', file=f)
def write_sitemap_index_header(f):
    """Write the XML declaration and the opening <sitemapindex> tag to *f*."""
    f.writelines((
        '<?xml version="1.0" encoding="UTF-8"?>\n',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
    ))
def write_sitemap_index_footer(f):
    """Close the <sitemapindex> element opened by write_sitemap_index_header."""
    print('</sitemapindex>', file=f)
def format_url_entry(path):
    """Return a single-line <url> element (newline-terminated) for *path*.

    The path is escaped via escape_xml and appended to the module-level
    base_url; changefreq is fixed at "never".
    """
    loc = f'{base_url}{escape_xml(path)}'
    return f'<url><loc>{loc}</loc><changefreq>never</changefreq></url>\n'
def write_sitemap_index_entry(f, sitemap_url):
    """Append one <sitemap> entry for *sitemap_url* to the index file *f*.

    lastmod is stamped with the current time via get_current_datetime().
    """
    entry = (
        '<sitemap>'
        f'<loc>{sitemap_url}</loc>'
        f'<lastmod>{get_current_datetime()}</lastmod>'
        '</sitemap>\n'
    )
    f.write(entry)
def save_compressed_sitemap(buffer, filename):
    """Persist *buffer*'s text to *filename* as gzip, then empty the buffer.

    The StringIO buffer is reused across sitemap chunks, so after writing it
    is rewound and truncated back to length zero.
    """
    content = buffer.getvalue()
    with gzip.open(filename, 'wt', encoding='utf-8') as out:
        out.write(content)
    # Reset the shared buffer so the caller can start the next sitemap.
    buffer.seek(0)
    buffer.truncate()
def main():
    """Build gzipped sitemap files from the input path lists, plus an index.

    Paths are read from ``input_file_list`` (missing/broken inputs are
    reported and skipped), formatted as <url> entries, and split across
    numbered sitemap files whenever either the URL-count limit or the
    uncompressed size limit would be exceeded.  Finally an uncompressed
    sitemap index referencing every generated file is written.
    """
    paths = []
    for input_file in input_file_list:
        try:
            paths += read_input_file(input_file)
        except FileNotFoundError:
            print(f"Error: Input file '{input_file}' not found")
            continue
        except Exception as e:
            # Name the offending file so failures in multi-file runs are
            # traceable (the original message omitted it).
            print(f"Error reading input file '{input_file}': {str(e)}")
            continue
    file_count = 1       # numeric suffix of the next sitemap file
    url_count = 0        # <url> entries in the current buffer
    sitemap_files = []   # generated filenames, later listed in the index
    # Each sitemap is accumulated uncompressed in memory, then gzipped on save.
    buffer = io.StringIO()
    write_sitemap_header(buffer)
    current_size = buffer.tell()  # uncompressed size of the pending sitemap
    for path in paths:
        url_entry = format_url_entry(path)
        # Roll over to a new file before either protocol limit is exceeded.
        if (url_count >= max_urls_per_file or current_size + len(url_entry) > max_file_size):
            write_sitemap_footer(buffer)
            output_file = f"{output_prefix}.{file_count}{output_extension}"
            save_compressed_sitemap(buffer, output_file)
            sitemap_files.append(output_file)
            print(f"writing {output_file}")
            file_count += 1
            url_count = 0
            write_sitemap_header(buffer)
            current_size = buffer.tell()
        buffer.write(url_entry)
        url_count += 1
        current_size = buffer.tell()
    # Flush the final, partially filled sitemap (skipped when no paths at all).
    if url_count > 0:
        write_sitemap_footer(buffer)
        output_file = f"{output_prefix}.{file_count}{output_extension}"
        save_compressed_sitemap(buffer, output_file)
        sitemap_files.append(output_file)
        print(f"writing {output_file}")
    print(f"writing {sitemap_index_file}")
    # The index itself stays uncompressed so crawlers can fetch it directly.
    with open(sitemap_index_file, 'w', encoding='utf-8') as index_file:
        write_sitemap_index_header(index_file)
        for sitemap_file in sitemap_files:
            sitemap_url = f"{base_url}{sitemap_file}"
            write_sitemap_index_entry(index_file, sitemap_url)
        write_sitemap_index_footer(index_file)


if __name__ == '__main__':
    main()