This module will sort text in the อักษรจีน. It is used to sort Ai-Cham, อามามิโอชิมะใต้, Central Bai, Panyi Bai, Southern Bai, Biao-Jiao Mien, Biyo, หมิ่นตะวันออก, จิ้น, จีนกลาง, ผิงเหนือ, Chinese Pidgin English, Puxian Min, Macau Pidgin Portuguese, ผิงใต้, Huizhou, หมิ่นตอนกลาง, ดุงกาน, Daur, E, กั้น, แคะ, Yemaek, ไหหลำ, เซียง, ญี่ปุ่น, Hachijō, Kikai, Lama Bai, จีนยุคกลาง, หล่อยแอว๋, จีนวรรณกรรม, Jie, Rouran, Tuyuhun, Tuoba, Wuhuan, Xianbei, เวียดนามกลาง, Caolan, หมิ่นเหนือ, ร่วม, มิยาโกะ, หมิ่นใต้, Datian Min, ฮกเกี้ยน, Hailufeng Min, Longyan Min, แต้จิ๋ว, Zhenan Min, Sanxiang Min, นุง, จีนเก่า, ญี่ปุ่นเก่า, โอกิโนเอราบุ, อุยกูร์เก่า, ปู้อี, Baekje, อามามิโอชิมะเหนือ, ยาเอยามะ, โอกินาวะ, สุ่ย, Bailang, โทกูโนชิมะ, Alchuka, Bala, Kyakala, ตั่ย, เวียดนาม, อู๋, Waxiang, ทิเบตคลาสสิก, มองโกเลียกลาง, Buyeo, คูนิงามิ, โยนางูนิ, โยรง, กวางตุ้ง, จ้วง, Zauzou, จีน, Shaozhou Tuhua, เสฉวน, ห่อยซัน, Goguryeo, Zakhring, คีตัน, and Gaya. The module should preferably not be called directly from templates or other modules. To use it from a template, use {{sortkey}}. Within a module, use Module:languages#Language:makeSortKey.

For testcases, see Module:Hani-sortkey/testcases.

Functions

makeSortKey(text, lang, sc)
Generates a sortkey for a given piece of text written in the script specified by the code sc and in the language specified by the code lang.
When the sort fails, returns nil.

The demonstration functions that generated the content shown below are housed in Module:Hani-sortkey/templates. Modifications to the module can be tested in Module:Hani-sortkey/sandbox. Sortkeys for individual characters are retrieved from one of 178 data modules. Module:Hani-sortkey/data creates documentation for these modules.

Show sortkeys

แก้ไข
  • PS/2接口 (PS2手08口00)
  • gas爐 (gas火16)
  • γ粒子 (γ米05子00)
  • 命裡有時終須有,命裡無時莫強求 (口05衣07月02日06糸05頁03月02口05衣07火08日06艸07弓08水02)
  • 得個……字 (彳08人08子03)
  • 濕𣲷𣲷 (水14水05水05)
  • 赛车赛车 (貝10車00)

Ideographic description sequences

แก้ไข
  • ⿰亻革 (⿰人00革00)
  • ⿰亻革家語 (⿰人00革00宀07言07)
  • ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵 (⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00麥09)

local export = {}

local m_str_utils = require("Module:string utilities")

local byte = string.byte
local codepoint = m_str_utils.codepoint
local concat = table.concat
local convert_iteration_marks = require("Module:Hani").convert_iteration_marks
local explode = m_str_utils.explode_utf8
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local sub = string.sub
local u = m_str_utils.char
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
local upper = m_str_utils.upper

local m_data = require("Module:Hani-sortkey/data/serialized")
local m_data_core = mw.loadData("Module:Hani-sortkey/data/core")
local cache = {}

--[[
	Finds where the ideographic description sequence (IDS) that starts at
	index i of an array of characters ends.
	
	text:   array of single-character strings.
	IDchar: the ideographic description character (IDC) located at text[i].
	i:      index of that IDC within text.
	
	Returns the index of the last character belonging to the IDS. Returns nil
	if an argument is missing, if IDchar has no entry in m_data_core.ids, or
	if the sequence is incomplete (the text ends before every expected
	component has been found). Recurses whenever a component is itself
	introduced by another IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end
	
	-- Number of components expected after current IDC.
	local components = m_data_core.ids[IDchar]
	if not components then
		return nil
	end
	
	local j = i
	
	for _ = 1, components do
		j = j + 1
		
		local char = text[j]
		
		if not char then
			-- The text ran out before all components were found.
			return nil
		elseif m_data_core.ids[char] then
			-- This component is a nested IDS: skip to its last character.
			j = findEndOfIDS(text, char, j)
			
			--[[
				An incomplete nested IDS makes the whole sequence invalid.
				Bail out now; otherwise the next iteration would try to
				do arithmetic on nil.
			]]
			if not j then
				return nil
			end
		end
	end
	
	-- Every expected component was found; j is the index of the last one.
	return j
end

-- Expands one serialized two-byte entry into sortkey text. The byte value of
-- `a` is the key looked up in m_data_core.radicals; the byte value of `b`,
-- minus 10, is appended as a zero-padded two-digit number.
local function unserialize(a, b)
	local radical = m_data_core.radicals[byte(a)]
	local value = byte(b) - 10
	return radical .. format("%02d", value)
end

-- Returns the sortkey text for a single character, given either as a string
-- or as a codepoint number; any other type raises an error.
--
-- The data is stored in [[Module:Hani-sortkey/data]]. This data is not accessed
-- directly (due to the large amount of memory this would consume), but is
-- instead stored in a serialized form as [[Module:Hani-sortkey/data/serialized]].
-- If the data is changed, the new serialized data can be generated with
-- [[Module:Hani-sortkey/data/serializer]].
--
-- m_data_core.ranges is read as consecutive (start, end) codepoint pairs, with
-- the number of entries in its `n` field. Each covered codepoint owns two bytes
-- of the serialized string, which are expanded by unserialize. A codepoint
-- that falls in no range is returned as the character itself.
function export.getData(char)
	local kind = type(char)
	if kind == "string" then
		char = codepoint(char)
	elseif kind ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end
	
	local ranges = m_data_core.ranges
	-- Number of codepoints covered by the ranges lying entirely below char.
	local preceding = 0
	for hi = 2, ranges.n, 2 do
		local first, last = ranges[hi - 1], ranges[hi]
		if char > last then
			preceding = preceding + last - first + 1
		elseif char >= first then
			-- char lies inside this range; `stop` is the index of the
			-- second byte of its two-byte entry.
			local stop = 2 * (preceding + char - first + 1)
			local pair = sub(m_data, stop - 1, stop)
			-- Parentheses drop gsub's second return value (the match count).
			return (gsub(pair, "(.)(.)", unserialize))
		end
	end
	
	return u(char)
end

-- Script codes handled by this module. Built once rather than on every call.
local han_scripts = {
	Hani = true,
	Hans = true,
	Hant = true,
	Jpan = true,
	Kore = true
}

-- Generates a sortkey for `text`.
--
-- text: the string to generate a sortkey for.
-- lang: accepted as part of the makeSortKey signature; not used in this body.
-- sc:   script code. If given and not one of Hani, Hans, Hant, Jpan or Kore,
--       the text is simply returned after being passed through `upper`.
--
-- Returns a string: the concatenation of the sortkey pieces for each character
-- (from export.getData), of the sortkeys listed in m_data_core.unsupported for
-- recognised ideographic description sequences, and of any ideographic
-- description characters that could not be resolved.
function export.makeSortKey(text, lang, sc)
	if sc and not han_scripts[sc] then
		return upper(text)
	end
	
	-- Convert any iteration marks into full characters, and remove any spaces.
	-- Also remove punctuation if the term contains non-punctuation (so that
	-- entries for punctuation characters can still be sorted properly).
	text = ugsub(convert_iteration_marks(text), "%s+", "")
	if not umatch(text, "^%p+$") then
		text = ugsub(text, "%p+", "")
	end
	
	-- From here on, `text` is an array of single characters.
	text = explode(text)
	local sort, text_len, i = {}, #text, 0
	while i < text_len do
		i = i + 1
		local char = text[i]
		
		-- A preconvert entry replaces this character in place with one or
		-- more characters; the array may grow, so its length is re-read.
		local preconverted = m_data_core.preconvert[char]
		if preconverted then
			local j = 0
			-- The pattern matches a lead byte plus any UTF-8 continuation bytes.
			for c in gmatch(preconverted, ".[\128-\191]*") do
				if j == 0 then
					text[i] = c
				else
					insert(text, i + j, c)
				end
				j = j + 1
			end
			char = text[i]
			text_len = #text
		end
		--[=[
			If we encounter an ideographic description character (IDC),
			find out if it begins a valid ideographic description sequence (IDS).
			
			If the IDS is valid and a sortkey for it is listed in
			[[Module:Hani-sortkey/data/unsupported]], then return
			the sortkey, and move to the next character after the
			IDS.
			
			Otherwise, insert the IDC into the sortkey and move to the next
			character after the IDC.
			
			If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if m_data_core.ids[char] then
			local j = findEndOfIDS(text, char, i)
			local IDS, data
			if j then
				IDS = concat(text, nil, i, j)
				data = m_data_core.unsupported[IDS]
			end
			
			if data then
				insert(sort, data)
				-- Skip the rest of the IDS; the loop resumes after index j.
				i = j
			else
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					-- Log the remaining text starting at the IDC, so the
					-- message shows what the invalid sequence begins,
					-- rather than only the lone IDC.
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. concat(text, nil, i) .. "'")
				end
				insert(sort, char)
			end
		else
			-- Per-character sortkeys are memoised in the module-level cache.
			if not cache[char] then
				cache[char] = export.getData(char)
			end
			insert(sort, cache[char])
		end
	end
	
	return concat(sort)
end

return export