-- Module table: py_process is attached to it further down and it is returned at the end of the file.
local export = {}

-- IPA for each syllable initial, keyed by its internal one-character code.
-- Lower-case keys are the pinyin letters themselves; the upper-case keys are the
-- single-character stand-ins for digraphs listed in digraph_decode below
-- (B = bv, P = pf, N = ng, Z = zh, C = ch, S = sh).
local initials = {
	b = "p", p = "pʰ", m = "m", f = "f", v = "v", B = "pf", P = "pfʰ",
	d = "t", t = "tʰ", n = "n", l = "l",
	g = "k", k = "kʰ", N = "ŋ", h = "x",
	j = "t͡ɕ", q = "t͡ɕʰ", x = "ɕ",
	Z = "t͡ʂ", C = "t͡ʂʰ", S = "ʂ", r = "ʐ",
	z = "t͡s", c = "t͡sʰ", s = "s",
	-- empty key: the initial capture in py_to_ipa is optional, so a syllable
	-- with no initial looks up "" here
	[""] = "",
}

-- IPA for each syllable final, keyed by its spelling in the internal encoding
-- produced by encode + py_normalize (U = ü, N = ng; "ii" and "ih" are the
-- spellings py_normalize gives to "i" after z/c/s and after zh/ch/sh/r).
-- Keys ending in "r" are r-suffixed counterparts of the line above them
-- (presumably the erhua forms -- confirm against the romanisation's documentation).
-- py_to_ipa raises "Invalid final" for any final that is not a key of this table.
local finals = {
	a = "a", ia = "ia", ua = "ua",
	ar = "ɐr", iar = "iɐr", uar = "uɐr",
	o = "o", uo = "uo", Uo = "yo",
	er = "ər", uor = "uər",
	e = "ɤ",
	ue = "ɯ", ie = "iɛ", Ue = "yɛ",
	ier = "iɛr", Uer = "yɛr",
	ii = "z̩", ih = "ʐ̩", i = "i", u = "u", U = "y",
	iir = "ər", ihr = "ər", ir = "iər", ur = "uər", Ur = "yər",
	ai = "æ", iai = "iæ", uai = "uæ",
	air = "ær", iair = "iær", uair = "uær",
	ei = "ei", ui = "uei",
	eir = "er", uir = "uer",
	ao = "au", iao = "iau",
	aor = "ɔr", iaor = "iɔr",
	ou = "ɤu", iu = "iɤu",
	our = "ər", iur = "iər",
	an = "ã", ian = "iã", uan = "uã", Uan = "yã",
	anr = "ɐ̃r", ianr = "iɐ̃r", uanr = "uɐ̃r", Uanr = "yɐ̃r",
	-- "in" is a reserved word in Lua, hence the bracketed key
	en = "ẽ", ["in"] = "iẽ", un = "uẽ", Un = "yẽ",
	enr = "ə̃r", inr = "iə̃r", unr = "uə̃r", Unr = "yə̃r",
	aN = "aŋ", iaN = "iaŋ", uaN = "uaŋ",
	aNr = "ɐ̃r", iaNr = "iɐ̃r", uaNr = "uɐ̃r",
	eN = "əŋ", iN = "iŋ", oN = "uəŋ", ioN = "yoŋ",
	eNr = "ə̃r", iNr = "iə̃r", oNr = "uə̃r", ioNr = "yə̃r",
}

-- Superscript tone values appended to a syllable's IPA, keyed by tone number
-- (as a string). py_transf obtains the number from the byte value of the
-- encoded tone mark: \1 = caron, \2 = acute, \3 = grave, \4 = macron
-- (see digraph_encode); a syllable without a mark gets "5".
local tones = {
	["1"] = "²¹", -- yin ping, "dark level" (T1)
	["2"] = "²⁴", -- yang ping, "light level" (T2)
	["3"] = "⁵³", -- shang, "rising" (T3)
	["4"] = "⁵⁵", -- qu, "departing" (T4)
	["5"] = "", -- toneless (T0)
}

-- Internal use: lookup tables that map two-byte sequences to one-byte codes
-- and back, so that the pattern-based functions below can treat every
-- initial, ü and tone mark as a single character.
--
-- digraph_encode is applied to NFD-decomposed text. The "\204\1xx" keys are
-- the UTF-8 encodings of combining diacritics:
--   \204\140 = U+030C caron, \204\129 = U+0301 acute,
--   \204\128 = U+0300 grave, \204\132 = U+0304 macron.
local digraph_encode = {
	bv = "B", pf = "P", ng = "N", zh = "Z", ch = "C", sh = "S",
	["\204\140"] = "\1",
	["\204\129"] = "\2",
	["\204\128"] = "\3",
	["\204\132"] = "\4",
}
-- Inverse mapping used by decode. It additionally expands U (produced by
-- encode for u + combining diaeresis) to "ü", and the markers \5 / \6 (inserted
-- by py_process around syllables in the phonetic rendering) to an HTML
-- highlighting <span> and its closing tag.
local digraph_decode = {
	B = "bv", P = "pf", N = "ng", Z = "zh", C = "ch", S = "sh", U = "ü",
	["\1"] = "\204\140",
	["\2"] = "\204\129",
	["\3"] = "\204\128",
	["\4"] = "\204\132",
	["\5"] = '<span style="background-color:#F5DEB3">',
	["\6"] = "</span>",
}
-- Convert pinyin text into the internal encoding.
-- The text is first decomposed (NFD) so that diacritics become separate
-- combining characters. Then "u" + combining diaeresis (\204\136) becomes "U",
-- and every two-byte sequence listed in digraph_encode (digraph initials,
-- "ng", tone marks) becomes its one-byte code. Pairs that match the pattern
-- but are not keys of digraph_encode are left untouched.
-- Returns the encoded string.
local function encode(text)
	local decomposed = mw.ustring.toNFD(text)
	local with_umlaut = decomposed:gsub("u\204\136", "U")
	local encoded = with_umlaut:gsub("[bpnzcs\204][vfgh\128\129\132\140]", digraph_encode)
	return encoded
end
-- Inverse of encode, used for display and for error messages.
-- Every internal code (B P N Z C S U and the control bytes \1-\6) is expanded
-- through digraph_decode; \7 is matched by the pattern but has no table entry
-- and is therefore kept as is. The result is recomposed to NFC.
-- Returns the decoded string.
local function decode(text)
	local expanded = text:gsub("[BPNZCSU\1-\7]", digraph_decode)
	return (mw.ustring.toNFC(expanded))
end

-- Remove the syllable-separating apostrophes that are redundant in writing.
-- An apostrophe is dropped when the next syllable starts with a consonant
-- letter other than N (optionally preceded by the \5 highlight marker);
-- apostrophes before a vowel or before N are kept. Any "ng" that results from
-- joining ("n'g" -> "ng") is then re-encoded as N.
-- Returns the joined string.
local function py_join_syllables(text)
	local joined = text:gsub("'(\5?[bpmfvBPdtnlgkhjqxZCSrzcsyw])", "%1")
	local recoded = joined:gsub("ng", "N")
	return recoded
end

-- Insert an apostrophe at every syllable boundary of an encoded reading.
-- Raises an error if the apostrophes already present in the input are not
-- exactly the ones py_join_syllables would leave in place.
-- Returns the fully divided string.
local function py_divide_syllables(text)
	-- vowel (or tone mark) + N + vowel: read as final -n followed by initial g-
	local divided = text:gsub("([aeiouU\1-\4])N%f[aeiouU]", "%1n'g")
	-- every consonant directly followed by a vowel starts a new syllable
	divided = divided:gsub("[bpmfvBPdtnlgkNhjqxZCSrzcsyw][aeiouU]", "'%0")
	-- collapse runs of apostrophes (one inserted next to one already written)
	divided = divided:gsub("''+", "'")
	-- no apostrophe at the very start of the text or right after a space
	divided = divided:gsub("%f[^ %z]'", "")

	-- round trip: joining the divided form must reproduce the input exactly
	local rejoined = py_join_syllables(divided)
	if rejoined ~= text then
		error("Xi'an: error with apostrophes, "..decode(text).." should be "..decode(rejoined)..".")
	end
	return divided
end

-- Place the encoded tone mark for `tone` on a toneless syllable.
-- `tone` is a tone number as a string; "5" means no mark. The mark (the byte
-- whose value is the tone number) is inserted after the first vowel, or after
-- the second vowel when the first one is i/u/U directly followed by another
-- vowel. A syllable without any vowel is returned unchanged.
local function py_put_tone(syllable, tone)
	local mark = ""
	if tone ~= "5" then
		mark = string.char(tone)
	end
	local marked = syllable:gsub("[iuU]?[aeiouU]", "%0" .. mark, 1)
	return marked
end

-- Turn one encoded syllable with an inline tone mark into "<tone digit><syllable
-- without mark>", e.g. a syllable carrying \4 becomes "4" .. the bare syllable.
-- A syllable without a mark gets tone "5".
-- Raises an error if the syllable has more than one tone mark, or if the mark
-- is not where py_put_tone would place it.
local function py_transf(syllable)
	-- the byte value of the mark (1-4) is the tone number; \5 stands for "no mark"
	local mark = syllable:match("[\1-\4]") or "\5"
	local tone = tostring(mark:byte(1))

	local bare, marks_found = syllable:gsub("[\1-\4]","")
	if marks_found > 1 then
		error("Xi'an: two tones in one syllable: " .. decode(syllable))
	end

	-- re-apply the tone to the bare syllable and compare with what was written
	local expected = py_put_tone(bare, tone)
	if expected ~= syllable then
		error("Xi'an: error with tone placement, "..decode(syllable).." should be "..decode(expected)..".")
	end
	return tone .. bare
end

-- canonize to adhere to pinyin rules, e.g. jü -> ju
-- Works on toneless text in the internal encoding (the inverse direction of
-- py_normalize). "Syllable-initial" below means: preceded by a non-letter or
-- by the start of the text. The steps depend on each other and must run in
-- this order.
local function py_canonize(text)
	-- ü is written u after j, q, x
	local out = text:gsub("([jqx])U", "%1u")
	-- syllable-initial u -> w (ui -> wei, un -> wen)
	out = out:gsub("%f[%l%u]u[in]?", {u="w",ui="wei",un="wen"})
	-- syllable-initial oN -> weN
	out = out:gsub("%f[%l%u]oN", "weN")
	-- a w that ends up with no vowel after it (bare w, or w + r) gets its u back
	out = out:gsub("w(r?)%f[^%l%u]", "wu%1")
	-- syllable-initial i -> y (ih -> ri, iu -> you)
	out = out:gsub("%f[%l%u]i[hu]?", {i="y",ih="ri",iu="you"})
	-- a y followed only by n/N and/or r gets its i back
	out = out:gsub("y([nN]?r?)%f[^%l%u]", "yi%1")
	-- syllable-initial ü -> yu
	out = out:gsub("%f[%l%u]U", "yu")
	-- the internal spellings ii / ih are both written i
	out = out:gsub("i[ih]", "i")
	return out
end

-- normalize to initial+final, e.g. ju -> jü
-- Rewrites toneless text from pinyin spelling into the initial + final
-- spelling used as keys of the `initials` and `finals` tables. Raises an error
-- when py_canonize does not map the result back onto the input, i.e. when the
-- input was not written in canonical pinyin. The steps must run in this order.
local function py_normalize(text)
	-- u after j, q, x is really ü
	local norm = text:gsub("([jqx])u", "%1U")
	-- w-spellings: wu -> u, wei -> ui, wen -> un, weN -> oN; any other w -> u
	norm = norm:gsub("w[ue][inN]?", {wu="u",wei="ui",wen="un",weN="oN"})
	norm = norm:gsub("w", "u")
	-- y-spellings: yi -> i, yu -> ü, you -> iu; any other y -> i
	norm = norm:gsub("y[iuo]u?", {yi="i",yu="U",you="iu"})
	norm = norm:gsub("y", "i")
	-- i after z/c/s is the final "ii", after Z/C/S/r the final "ih"
	norm = norm:gsub("([zcs])i", "%1ii")
	norm = norm:gsub("([ZCSr])i", "%1ih")
	-- "rih" with no letter after it is the bare final "ih" without an initial
	norm = norm:gsub("rih%f[^%l%u]", "ih")

	local roundtrip = py_canonize(norm)
	if roundtrip ~= text then
		error("Xi'an: invalid syllable: "..decode(text).." should be "..decode(roundtrip))
	end
	return norm
end

-- Convert normalized text (space-separated syllables of the form
-- <tone digit><initial?><final><optional second tone digit>) to IPA.
-- Each syllable becomes initial + final + tone value; when a second digit is
-- present, "⁻" and that tone's value are appended. Raises an error for a
-- syllable that does not have this shape or whose initial/final is not a key
-- of the lookup tables. Returns the result wrapped in slashes.
local function py_to_ipa(text)
	local function syllable_to_ipa(syllable)
		local tone, initial, final, trailing_tone =
			syllable:match("^([12345])([bpmfvBPdtnlgkNhjqxZCSrzcs]?)([aeiouU][%lN]*)([12345]?)$")
		if not tone then
			error("Xi'an: Invalid syllable: " .. decode(syllable))
		end
		local initial_ipa = initials[initial] or error("Xi'an: Invalid initial: " .. decode(initial))
		local final_ipa = finals[final] or error ("Xi'an: Invalid final: " .. decode(final))
		local ipa = initial_ipa .. final_ipa .. tones[tone]
		if trailing_tone ~= "" then
			ipa = ipa .. "⁻" .. tones[trailing_tone]
		end
		return ipa
	end

	local converted = text:gsub("[^ ]+", syllable_to_ipa)
	return "/" .. converted .. "/"
end

-- returns (display_text, phonetic_text, ipa)
-- `text` holds one or more readings separated by "/". For each reading:
--   * display text: the reading as written, with the digits 1-5 removed;
--   * phonetic text: the display text, followed -- only when the reading
--     contains tone digits -- by " [Phonetic: ...]", a rendering in which each
--     digit-suffixed syllable is highlighted and carries the tone mark of its
--     digit instead of the written one;
--   * ipa: the IPA transcription from py_to_ipa.
-- Display and phonetic texts are joined with " / ", IPA strings with ", ".
-- Errors raised by the helper functions for malformed input propagate.
function export.py_process(text)
	local display_parts, hidden_parts, ipa_parts = {}, {}, {}
	local n = 0
	for reading in mw.text.gsplit(text,"/",true) do
		n = n + 1
		local shown = reading:gsub("[12345]","")
		display_parts[n] = shown

		-- no check is done for things like "xUān", any capitalisation is valid
		local coded = py_divide_syllables(encode(mw.ustring.lower(reading)))

		local hidden = shown
		if coded:match("[12345]") then
			-- wrap each digit-suffixed syllable in the \5 ... \6 highlight markers
			-- and swap its written tone mark for the one named by the digit
			local phonetic = coded
				:gsub("([bpmfvBPdtnlgkNhjqxZCSrzcsyw]?[iuU]?[aeiouU])[\1-\4]?([%lN]*)([1-5])", function(head, tail, digit)
					local mark = ""
					if digit ~= "5" then
						mark = string.char(digit)
					end
					return "\5" .. head .. mark .. tail .. "\6"
				end)
			hidden = shown .. " [Phonetic: " .. decode(py_join_syllables(phonetic)) .. "]"
		end
		hidden_parts[n] = hidden

		-- one space-separated syllable each, tone mark converted to a leading digit
		local syllables = coded:gsub("'"," "):gsub("[^ ]+",py_transf)
		ipa_parts[n] = py_to_ipa(py_normalize(syllables))
	end
	return table.concat(display_parts, " / "),
		table.concat(hidden_parts, " / "),
		table.concat(ipa_parts, ", ")
end

-- Hand the module table (with py_process attached) back to the loader.
return export