# Zanurkuj w Pythonie (Dive Into Python) / Sources / unicode2koi8r.py
#
# From Wikibooks, the library of free textbooks.
"""Convert Cyrillic from iso-8859-1 Unicode-encoded to KOI8-R-encoded

This script is used during the build process of the Russian translation
of "Dive Into Python" (http://diveintopython.org/).

It takes one argument, which can be either an HTML file or a directory.
If a file, it converts the file in place; if a directory, it converts
every HTML file in the immediate directory (but not recursively).

Safe but pointless to run more than once on the same file or directory.
"""

# Module metadata.  NOTE(review): the "$Revision: ... $" and "$Date: ... $"
# values look like version-control keyword expansions -- confirm before
# editing them by hand.
__author__ = "Mark Pilgrim (mark@diveintopython.org)"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2004/05/05 21:57:19 $"
__copyright__ = "Copyright (c) 2001 Mark Pilgrim"
__license__ = "Python"

import os
import sys
import re

# Maps the HTML decimal character reference ("&#NNNN;") of each Russian
# Cyrillic letter to a one-character string holding its KOI8-R byte value.
#
# The keys have to be spelled as numeric references, not as literal Cyrillic
# letters: translateMatch looks up exactly the text matched by unicodePattern
# (r'&#[0-9]{4,4};'), so a key that is a literal letter can never be found.
# Keeping the keys ASCII also keeps this source file free of non-ASCII bytes.
unicodeToKOI8R = {
	'&#1025;': '\xb3',  # capital IO
	# Capital letters A .. YA (U+0410 .. U+042F)
	'&#1040;': '\xe1',
	'&#1041;': '\xe2',
	'&#1042;': '\xf7',
	'&#1043;': '\xe7',
	'&#1044;': '\xe4',
	'&#1045;': '\xe5',
	'&#1046;': '\xf6',
	'&#1047;': '\xfa',
	'&#1048;': '\xe9',
	'&#1049;': '\xea',
	'&#1050;': '\xeb',
	'&#1051;': '\xec',
	'&#1052;': '\xed',
	'&#1053;': '\xee',
	'&#1054;': '\xef',
	'&#1055;': '\xf0',
	'&#1056;': '\xf2',
	'&#1057;': '\xf3',
	'&#1058;': '\xf4',
	'&#1059;': '\xf5',
	'&#1060;': '\xe6',
	'&#1061;': '\xe8',
	'&#1062;': '\xe3',
	'&#1063;': '\xfe',
	'&#1064;': '\xfb',
	'&#1065;': '\xfd',
	'&#1066;': '\xff',
	'&#1067;': '\xf9',
	'&#1068;': '\xf8',
	'&#1069;': '\xfc',
	'&#1070;': '\xe0',
	'&#1071;': '\xf1',
	# Small letters a .. ya (U+0430 .. U+044F)
	'&#1072;': '\xc1',
	'&#1073;': '\xc2',
	'&#1074;': '\xd7',
	'&#1075;': '\xc7',
	'&#1076;': '\xc4',
	'&#1077;': '\xc5',
	'&#1078;': '\xd6',
	'&#1079;': '\xda',
	'&#1080;': '\xc9',
	'&#1081;': '\xca',
	'&#1082;': '\xcb',
	'&#1083;': '\xcc',
	'&#1084;': '\xcd',
	'&#1085;': '\xce',
	'&#1086;': '\xcf',
	'&#1087;': '\xd0',
	'&#1088;': '\xd2',
	'&#1089;': '\xd3',
	'&#1090;': '\xd4',
	'&#1091;': '\xd5',
	'&#1092;': '\xc6',
	'&#1093;': '\xc8',
	'&#1094;': '\xc3',
	'&#1095;': '\xde',
	'&#1096;': '\xdb',
	'&#1097;': '\xdd',
	'&#1098;': '\xdf',
	'&#1099;': '\xd9',
	'&#1100;': '\xd8',
	'&#1101;': '\xdc',
	'&#1102;': '\xc0',
	'&#1103;': '\xd1',
	'&#1105;': '\xa3',  # small io
}

# Decimal character references with exactly four digits, e.g. "&#1040;".
unicodePattern = re.compile(
	r'&#[0-9]{4,4};')
# The charset name that gets replaced; letter case is ignored when matching.
charsetPattern = re.compile(
	r'ISO-8859-1', flags=re.I)

def translateMatch(match):
	"""Return the replacement text for one matched character reference.

	match is a regular-expression match object.  Its whole match
	(group 0) is looked up in unicodeToKOI8R and the mapped value is
	returned.  When the table has no entry for that text, the matched
	text itself is returned, so unknown references pass through unchanged.
	"""
	entity = match.group(0)
	# dict.get does the membership test and the fetch in one lookup, and
	# avoids dict.has_key(), which no longer exists on Python 3.
	return unicodeToKOI8R.get(entity, entity)

def translateBuffer(buffer):
	"""Return buffer with references and charset name rewritten.

	First every substring matched by unicodePattern is replaced with the
	value translateMatch returns for it; then every match of
	charsetPattern in that result is replaced with the text 'KOI8-R'.
	"""
	withoutReferences = unicodePattern.sub(translateMatch, buffer)
	return charsetPattern.sub('KOI8-R', withoutReferences)

def translateFile(filename, outfilename=None):
	"""Run the contents of one file through translateBuffer.

	filename is the file to read.  outfilename is the file to write; when
	it is omitted or empty, filename is overwritten in place.

	The file is read and written in binary mode, so every byte that
	translateBuffer leaves alone (line endings included) is written back
	exactly as it was read.  Each file is closed even if reading,
	translating or writing raises an exception.
	"""
	if not outfilename:
		outfilename = filename
	with open(filename, 'rb') as fsock:
		raw = fsock.read()
	# On Python 2 a binary read already yields str, which the str-based
	# patterns accept as is.  On Python 3 it yields bytes; ISO-8859-1 maps
	# every byte to the code point of the same value, so decoding is lossless.
	if isinstance(raw, str):
		text = raw
	else:
		text = raw.decode('iso-8859-1')
	text = translateBuffer(text)
	# Undo the decoding step so the binary-mode write always receives bytes.
	if isinstance(text, bytes):
		data = text
	else:
		data = text.encode('iso-8859-1')
	# The output is opened only once the translated data is ready, so a
	# failure above cannot truncate a file that is being converted in place.
	with open(outfilename, 'wb') as fsock:
		fsock.write(data)

def htmlFilter(filename):
	"""Tell whether filename's extension is exactly '.html'.

	The extension is the one reported by os.path.splitext, and the
	comparison is case-sensitive.
	"""
	_, extension = os.path.splitext(filename)
	return extension == '.html'

def translateDirectory(directoryname, filterFunc=htmlFilter):
	"""Translate, in place, the selected entries directly inside a directory.

	directoryname is the directory whose entries are listed; subdirectories
	are not descended into.  filterFunc is called with each entry's path
	(directoryname joined with the entry name), and only entries for which
	it returns a true value are passed to translateFile.  The default,
	htmlFilter, keeps paths whose extension is '.html'.  Passing None keeps
	every entry, as the built-in filter() does for a None function.
	"""
	candidates = [os.path.join(directoryname, entry)
	              for entry in os.listdir(directoryname)]
	selected = [path for path in candidates
	            if filterFunc is None or filterFunc(path)]
	# An explicit loop performs the translation on every Python version.
	# map() returns a lazy iterator on Python 3, so calling it only for its
	# side effects would translate nothing.
	for path in selected:
		translateFile(path)

if __name__ == "__main__":
	# One operand is required: the file or directory to convert.  Without it,
	# report the usage and exit non-zero instead of failing with an
	# IndexError traceback.
	if len(sys.argv) < 2:
		sys.exit("usage: %s FILE_OR_DIRECTORY" % os.path.basename(sys.argv[0]))
	name = sys.argv[1]
	# A directory has its matching entries converted; anything else is
	# treated as a single file.
	if os.path.isdir(name):
		translateDirectory(name)
	else:
		translateFile(name)