#!/usr/bin/env python
# -*- coding: utf-8 -*-
# convert a file full of kana to romaji
# Copyright 2005 Ryan Schultz <schultz.ryan@gmail.com>
# GNU GPL v2 

import sys
import os
import re
import string
import codecs

# if it helps, imagine Art Metrano singing and doing magic tricks
# while reading this, because it's scary

# Small kana (sokuon) that stand for a doubled character in the romaji.
SOKUON = u'っッ'
# Characters after which a sokuon repeats the preceding character instead of
# the following one.
SOKUON_TERMINATORS = u'.,\n'


def load_kana_table(path):
    """Read the kana table at *path* and return it as a list of rows.

    Each line is split on two-space separators and every field is stripped.
    Column 0 of a row is the replacement text; columns 1 and 2 are the
    strings it replaces (presumably the hiragana and katakana spellings --
    confirm against the kana_list file). Lines without any content are
    skipped.
    """
    rows = []
    with codecs.open(path, 'r', 'utf-8') as table:
        for line in table:
            fields = [field.strip() for field in line.split('  ')]
            if any(fields):
                rows.append(fields)
    return rows


def kana_to_romaji(text, table):
    """Return *text* with every kana listed in *table* replaced.

    *table* is a list of rows as produced by load_kana_table(). Kana are
    matched literally, not as regular expressions.
    """
    # Walk the table backwards so that composite kana are subbed before the
    # single kana they are made of.
    for row in reversed(table):
        romaji = row[0]
        for kana in row[1:3]:
            # An empty search string would insert the romaji between every
            # character of the text, so blank columns are ignored.
            if kana:
                text = text.replace(kana, romaji)
    return text


def double_sokuon(text, terminators=SOKUON_TERMINATORS):
    """Return *text* with each sokuon (little tsu) replaced by a doubled
    character.

    A sokuon becomes a copy of the next character that is not itself a
    sokuon. When that character is one of *terminators*, or the text ends,
    the preceding output character is repeated instead. A sokuon with
    nothing usable on either side is left unchanged.

    This is meant to run after kana_to_romaji(), so the neighbouring
    characters are already romaji.
    """
    out = []
    length = len(text)
    for pos, char in enumerate(text):
        if char not in SOKUON:
            out.append(char)
            continue
        # Look past a run of consecutive sokuon; copying a sokuon onto a
        # sokuon would never make progress.
        following = pos + 1
        while following < length and text[following] in SOKUON:
            following += 1
        if following < length and text[following] not in terminators:
            out.append(text[following])
        elif out:
            out.append(out[-1])
        else:
            out.append(char)
    return u''.join(out)


def main():
    """Convert the kana file named by the first command-line argument.

    The kana table is read from 'kana_list' in the current directory and
    the result is written to the input path with 'RMJI' appended.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    source_path = sys.argv[1]
    target_path = source_path + 'RMJI'

    print('loading kana_list file')
    table = load_kana_table('kana_list')

    print('opening document')
    with codecs.open(source_path, 'r', 'utf-8') as source:
        doc = source.read()

    print('subbing kana with romaji')
    doc = kana_to_romaji(doc, table)
    # double characters on sokuon (little tsu)
    doc = double_sokuon(doc)

    print('writing converted file to %s' % target_path)
    with codecs.open(target_path, 'w', 'utf-8') as target:
        target.write(doc)
    print('done!')


if __name__ == '__main__':
    main()
