Jump to content

Module:character list

From Wiktionary, the free dictionary

This module uses Module:Unicode data to generate Appendix:Unicode and its subpages.


local m_unicode = require("Module:Unicode data")
local m_uni_alias = require("Module:Unicode data/aliases")

local Array = require("Module:array")

local char_to_script = require("Module:scripts").charToScript
local concat = table.concat
local get_block_range = m_unicode.get_block_range
local get_category_long_name = m_unicode.get_category_long_name
local get_script_alias = m_unicode.get_script_alias
local html_create = mw.html.create
local insert = table.insert
local is_assigned = m_unicode.is_assigned
local list_to_text = mw.text.listToText
local lookup_category = m_unicode.lookup_category
local lookup_script = m_unicode.lookup_script
local max = math.max
local min = math.min
local new_title = mw.title.new
local process_params = require("Module:parameters").process
local safe_require = require("Module:load").safe_require
local spell_number = require("Module:ConvertNumeric").spell_number
local u = require("Module:string utilities").char

local general_category_data = require("Module:Unicode data/category")
local general_category_aliases = general_category_data.long_names
local script_data = require("Module:Unicode data/scripts")

local content_lang = mw.language.getContentLanguage()

-- Table of functions exposed by this module; returned at the end of the file.
local export = {}

-- Unicode standard version quoted in the block header text.
local Unicode_version = "16.0"

-- Number of codepoints in the inclusive range `block_start`..`block_end`.
local function get_size(block_start, block_end)
	local span = block_start - 1
	return block_end - span
end

-- A block counts as "large" when it holds more than 0x1000 codepoints
-- (i.e. more than 1/16 of a plane).
local function is_large_block(block_start, block_end)
	local size = get_size(block_start, block_end)
	return size > 0x1000
end

-- Parse the page name to check if parameters can be generated automatically. This works for subpages of Appendix:Unicode, in the format "Appendix:Unicode/Block name". Large blocks are divided into sublists of (up to) 0x1000 characters, which are subpages of the block's page in the format "Appendix:Unicode/Block name/X000", where "X000" is the first codepoint of the sublist.
-- `title` is an optional page name; when omitted (or when it cannot be turned into a title object), the current page is used.
-- If the page follows one of these formats, returns a table with the following keys:
-- `name` - the block name
-- `block_start` - the first codepoint in the block
-- `block_end` - the last codepoint in the block
-- If the page is a range subpage, the table will have two additional keys:
-- `range_start` - the first codepoint in the range
-- `range_end` - the last codepoint in the range
-- Returns nothing if the page does not follow either format.
local function parse_page_name(title)
	title = title and new_title(title) or mw.title.getCurrentTitle()
	-- Only pages in namespace 100 (the one holding Appendix:Unicode) qualify.
	if title.namespace ~= 100 then
		return
	end
	local base_text, block_name = title.baseText
	local is_range_subpage = base_text ~= "Unicode"
	-- Appendix:Unicode/Block name/0000.
	if is_range_subpage then
		local base_title = new_title(base_text, 100)
		-- Block name must be a subpage of Appendix:Unicode.
		if not base_title or base_title.baseText ~= "Unicode" then
			return
		end
		block_name = base_title.subpageText
	-- Appendix:Unicode/Block name.
	else
		block_name = title.subpageText
	end
	local block_start, block_end = get_block_range(block_name)
	if not (block_start and block_end) then
		return
	end
	local page_data = {
		name = block_name,
		block_start = block_start,
		block_end = block_end
	}
	if not is_range_subpage then
		return page_data
	end
	-- Range start must be 4/5/6-digit codepoint.
	local raw = title.subpageText
	local range_start = tonumber(raw, 16)
	if not (
		-- Must be a hex number.
		range_start and
		-- Must be a 4/5/6-digit codepoint in canonical form (uppercase, zero-padded to at least 4 digits).
		raw == ("%04X"):format(range_start) and
		-- Must be within the block's range.
		range_start >= block_start and
		range_start <= block_end and
		-- Must be a large block.
		is_large_block(block_start, block_end)
	) then
		return
	end
	local mod = range_start % 0x1000
	-- Must be the start of the block or a X000 codepoint.
	if not (range_start == block_start or mod == 0) then
		return
	end
	page_data.range_start = range_start
	-- The range runs to the end of its 0x1000-aligned chunk, or to the end of the block if that comes first.
	page_data.range_end = min(range_start - mod + 0xFFF, block_end)
	return page_data
end

-- Collects a data table for every codepoint from `block_start` to `block_end`
-- (inclusive) that passes `filterer`. If `filterer` is not given, every
-- codepoint is included. Each table holds the alias entry from the aliases
-- module, the results of the `lookup_*` functions of Module:Unicode data for
-- name, script, category, image and image_emoji, and the codepoint as `cp`.
local function get_data_for_code_point_range(block_start, block_end, filterer)
	local collected = {}
	for code_point = block_start, block_end do
		local wanted = true
		if filterer then
			wanted = filterer(code_point)
		end
		if wanted then
			local entry = {}
			entry.aliases = m_uni_alias[code_point]
			for _, field in ipairs { "name", "script", "category", "image", "image_emoji" } do
				local lookup = m_unicode["lookup_" .. field]
				entry[field] = lookup(code_point)
			end
			entry.cp = code_point
			collected[#collected + 1] = entry
		end
	end
	return collected
end

-- Template entry point: renders a wikitable listing every Unicode block that
-- lies entirely within the codepoint range given by parameters 1 and 2 of the
-- calling template (both required; hexadecimal input is allowed). Each row
-- shows the block's first and last codepoints and links to its appendix page.
function export.block_list_t(frame)
	local required_num_param = {required = true, type = "number", allow_hex = true}
	local args = process_params(frame:getParent().args, {
		[1] = required_num_param,
		[2] = required_num_param,
	})

	local result = {}
	local start_codepoint, end_codepoint = args[1], args[2]
	
	insert(result, "{| class=\"wikitable\" style=\"width: 100%;\"\n! width=\"10%;\" | Start\n! width=\"10%;\" | End\n ! Block name\n")
	for _, name, block_start, block_end in m_unicode.enum_blocks() do
		if (block_start >= start_codepoint) and (block_end <= end_codepoint) then
			-- Four arguments: start and end as U+XXXX, then the block name as link target and label.
			insert(result, (
				"|-\n|U+%04X\n|U+%04X\n|[[Appendix:Unicode/%s|%s]]\n"
			):format(block_start, block_end, name, name))
		end
	end
	insert(result, "|}")
	
	return concat(result)
end
export.show_blocks = export.block_list_t

-- Checks whether every assigned codepoint from `i` to `j` (inclusive) gives the
-- same result when passed to `lookup_func`. Returns that common result if so;
-- returns nothing as soon as two assigned codepoints disagree. Unassigned
-- codepoints are skipped.
local function get_shared_value(i, j, lookup_func)
	local shared
	for code_point = i, j do
		if is_assigned(code_point) then
			local current = lookup_func(code_point)
			if shared == nil then
				-- First value seen so far; later ones are compared against it.
				shared = current
			elseif current ~= shared then
				return
			end
		end
	end
	return shared
end

-- Builds a piped wikilink to `target` whose label is `display` with an arrow:
-- a leading "⟵" when `left_arrow` is truthy, otherwise a trailing "⟶".
local function navlink(target, display, left_arrow)
	local first, second
	if left_arrow then
		first, second = "⟵", display or "⟶"
	else
		first, second = display, "⟶"
	end
	return ("[[%s|%s %s]]"):format(target, first, second)
end

-- Navigation link to a sibling block page. The target is relative: "../Name"
-- normally, or "../../Name" when `subpage` is truthy. Returns an empty string
-- when there is no `block_name`.
local function block_navlink(block_name, left_arrow, subpage)
	if not block_name then
		return ""
	end
	local prefix = subpage and "../" or ""
	local target = ("%s../%s"):format(prefix, block_name)
	return navlink(target, block_name, left_arrow)
end

-- Navigation link to a sibling range subpage ("../XXXX"), labelled with the
-- range it covers ("U+XXXX to U+XXXX"). Returns an empty string when either
-- bound is missing (nil or false), which callers use to signal that there is
-- no previous/next range.
local function subpage_navlink(range_start, range_end, left_arrow)
	if not (range_start and range_end) then
		return ""
	end
	return navlink(
		-- Subpage names are the range's first codepoint as at least 4 uppercase hex digits.
		("../%04X"):format(range_start),
		("U+%04X to U+%04X"):format(range_start, range_end),
		left_arrow
	)
end

-- Finishes a header: the stringified `text`, followed by the formatted
-- categories "Unicode blocks" and "<name> block", followed by the output of
-- Module:TemplateStyles for the module's stylesheet.
local function return_header(text, name)
	local rendered = tostring(text)
	local categories = require("Module:utilities").format_categories(
		{"Unicode blocks", name .. " block"},
		nil,
		name
	)
	local styles = require("Module:TemplateStyles")("Module:character list/styles.css")
	return rendered .. categories .. styles
end

-- Template entry point: renders the header shown at the top of a block page
-- ("Appendix:Unicode/Block name") or a range subpage of a large block
-- ("Appendix:Unicode/Block name/X000").
-- The header consists of a navigation table (previous/next block and, on range
-- subpages, previous/next range), a sentence describing the block, any
-- General_Category / Script value shared by all assigned characters, and, on
-- the main page of a large block, a list of links to its range subpages.
-- The optional `pagename` template parameter overrides the page being described.
-- Raises an error if the page is not a recognised subpage of Appendix:Unicode.
function export.char_list_header_t(frame)
	local pagename = process_params(frame:getParent().args, {
		["pagename"] = {demo = true}
	}).pagename
	
	local page_data = parse_page_name(pagename)
	
	if not page_data then
		error("Page is not a valid subpage of [[Appendix:Unicode]].")
	end
	
	local name = page_data.name
	
	-- Find the blocks immediately before and after this one in enumeration order.
	local prev_block, next_block, found
	for _, block_name in m_unicode.enum_blocks() do
		if block_name == name then
			found = true
		elseif found then
			next_block = block_name
			break
		else
			prev_block = block_name
		end
	end
	
	local block_start, block_end = page_data.block_start, page_data.block_end
	local block_size = get_size(block_start, block_end)
	local range_start, range_end = page_data.range_start, page_data.range_end
	-- "range" for a range subpage, "large block" for the main page of a large
	-- block, and false for an ordinary block page.
	local page_type = is_large_block(block_start, block_end) and (
		range_start and "range" or "large block"
	)
	
	local heading = html_create("td")
		:addClass("unicode-header-heading")
		:tag("h2")
			:wikitext(name)
			:done()
	
	-- On range subpages the heading spans both navigation rows.
	if page_type == "range" then
		heading = heading:attr("rowspan", 2)
	end
	
	local tbl = html_create("table")
		:addClass("unicode-header-table")
		:tag("tr")
			:tag("td")
				:addClass("unicode-nav-button")
				:addClass("unicode-nav-button-left")
				:wikitext(block_navlink(prev_block, true, page_type == "range"))
				:done()
			:node(heading)
			:tag("td")
				:addClass("unicode-nav-button")
				:addClass("unicode-nav-button-right")
				:wikitext(block_navlink(next_block, false, page_type == "range"))
		:allDone()
	
	if page_type == "range" then
		-- Second row: links to the neighbouring ranges of the same block. A
		-- false bound makes subpage_navlink return an empty string, so the
		-- first/last range gets an empty cell.
		tbl = tbl:tag("tr")
			:tag("td")
				:addClass("unicode-nav-button")
				:addClass("unicode-nav-button-left")
				:wikitext(subpage_navlink(
					range_start ~= block_start and max(block_start, range_start - 0x1000),
					range_start - 1,
					true
				))
				:done()
			:tag("td")
				:addClass("unicode-nav-button")
				:addClass("unicode-nav-button-right")
				:wikitext(subpage_navlink(
					range_end + 1,
					range_end ~= block_end and min(block_end, range_end + 0x1000),
					false
				))
			:allDone()
	end
	
	local text = html_create():node(tbl)
	local div = text:tag("div")
		:wikitext("This page lists ")
	
	if page_type == "range" then
		div:wikitext(("code points U+%04X to U+%04X from "):format(range_start, range_end))
	else
		div:wikitext("the characters in ")
	end
	
	-- The code chart PDF is named after the block's first codepoint.
	div:wikitext(("the [http://unicode.org/charts/PDF/U%04X.pdf %s] block of the [[w:Unicode|Unicode]] standard (version %s), which covers %s code points from U+%04X to U+%04X"):format(
		block_start, name, Unicode_version, content_lang:formatNum(block_size), block_start, block_end
	))
	
	local general_category = get_shared_value(range_start or block_start, range_end or block_end, lookup_category)
	
	-- Surrogate blocks get a fixed explanation and nothing else.
	if general_category == "Cs" then
		div:wikitext(".")
		
		local div2 = text:tag("div")
			:css("margin-top", "0.5em")
			:wikitext("This block contains no character assignments, and is reserved for use by ")
		if name:lower():match("private use") then
			div2:wikitext("individual applications using ")
		end
		div2:wikitext("[[w:UTF-16|UTF-16]].")
		
		return return_header(text, name)
	end
	
	-- Count the assigned codepoints on this page (the range, or the whole block).
	local assigned = 0
	for cp = range_start or block_start, range_end or block_end do
		if is_assigned(cp) then
			assigned = assigned + 1
		end
	end
	
	if page_type ~= "range" then
		div:wikitext(("%s ha%s been assigned"):format(
			assigned == block_size and ", all of which" or ", of which " .. content_lang:formatNum(assigned),
			assigned == 1 and "s" or "ve"
		))
	end
	
	-- Private use blocks also get a fixed explanation.
	if general_category == "Co" then
		-- Supplementary Private Use Area-A & B. Kludgy, but fine unless more PUA areas are added.
		if assigned ~= block_size then
			div:wikitext((" (U+%04X to U+%04X)"):format(block_start, block_end - 2))
		end
		div:wikitext(".")
		
		text = text:tag("div")
			:css("margin-top", "0.5em")
			:wikitext("This block contains code points that purposely have no interpretation specified, and is reserved for use by individual applications.")
			:done()
		
		return return_header(text, name)
	end
	
	div:wikitext(".")
	
	-- Each entry: { Wikipedia article, property name, value, long name of the value }.
	local properties = {}
	
	if general_category then
		insert(properties, {
			"Unicode character property",
			"General_Category",
			general_category,
			get_category_long_name(general_category)
		})
	end
	
	local script = get_shared_value(range_start or block_start, range_end or block_end, lookup_script)
	if script then
		local property = {
			"Unicode script",
			"Script",
			script,
		}
		local alias = get_script_alias(script)
		local script_obj = require("Module:scripts").getByCode(script)
		-- Link the alias to Wikipedia when the script is known to Module:scripts.
		insert(property, script_obj and ("[[w:%s|%s]]"):format(script_obj:getWikipediaArticle(), alias) or alias)
		insert(properties, property)
	end
	
	if #properties > 0 then
		local list = {}
		for i = 1, #properties do
			local property = properties[i]
			insert(list, tostring(html_create()
				:wikitext("the ")
				:tag("code")
					:wikitext(("[[w:%s|%s]]"):format(property[1], property[2]))
					:done()
				:wikitext(" value ")
				:tag("code")
					:wikitext(property[3])
					:done()
				:wikitext(" (")
				:tag("code")
					:wikitext(property[4])
					:done()
				:wikitext(")")
			))
		end
		
		text:tag("div")
			:css("margin-top", "0.5em")
			:wikitext(("All %scharacters in this %sblock have %s."):format(
				-- Say "assigned characters" only if some codepoints on this page are unassigned.
				assigned == (page_type == "range" and get_size(range_start, range_end) or block_size) and "" or "assigned ",
				page_type == "range" and "portion of the " or "",
				list_to_text(list)
			))
	end
	
	if page_type == "large block" then
		local list = html_create("ul")
		
		-- Walk the block in 0x1000-aligned chunks, clamping the first and last
		-- chunk to the block's own bounds.
		local r_start, n = block_start - block_start % 0x1000, 0
		while r_start <= block_end do
			n = n + 1
			local r_end = r_start + 0xFFF
			local actual_start, actual_end = max(block_start, r_start), min(block_end, r_end)
			list = list:tag("li")
				:wikitext(("[[/%04X|U+%04X (%s) to U+%04X (%s)]]"):format(
					actual_start, actual_start, u(actual_start), actual_end, u(actual_end))
				)
				:done()
			r_start = r_end + 1
		end
		
		text:tag("div")
			:css("margin-top", "0.5em")
			:wikitext((" Due to the size of the block, the list has been split across %s pages:"):format(spell_number(n)))
			:node(list)
			:done()
	end
	
	return return_header(text, name)
end
export.show_header = export.char_list_header_t

-- Template entry point: renders a sortable wikitable of the assigned
-- codepoints in a range. The range is taken, in order of preference, from:
--   * the `block` parameter (a Unicode block name),
--   * parameters 1 and 2 (first and last codepoint; hexadecimal allowed),
--   * the page name (or the `pagename` parameter), via parse_page_name.
-- Raises an error if none of these yields a range.
-- Columns: code point, image (split into text-style and emoji-style when any
-- emoji image exists), the character itself, General Category and Script
-- (each only when not identical for every row), and the name with aliases.
function export.char_list_t(frame)
	local parent = frame:getParent()
	local num_param = {type = "number", allow_hex = true}
	
	-- Use the parent frame's arguments unless the parent is the current page
	-- itself, in which case the arguments of the direct invocation are used.
	local args = process_params((parent and parent:getTitle() ~= mw.title.getCurrentTitle().fullText and parent or frame).args, {
		[1] = num_param,
		[2] = num_param,
		["block"] = {convert = function(block_name, err)
			local block_start, block_end = get_block_range(block_name)
			if not (block_start and block_end) then
				err("Invalid Unicode block specified")
			end
			return {
				block_start = block_start,
				block_end = block_end
			}
		end},
		["pagename"] = {demo = true}
	})

	local result = {}
	local block, range_start, range_end = args.block
	
	if block then
		range_start, range_end = block.block_start, block.block_end
	elseif args[1] and args[2] then
		range_start, range_end = args[1], args[2]
	else
		local page_data = parse_page_name(args.pagename)
		if not page_data then
			error("Must give a Unicode block or character range")
		end
		range_start = page_data.range_start or page_data.block_start
		range_end = page_data.range_end or page_data.block_end
	end
	
	-- Wikitext for the "Character" cell of one codepoint.
	local function present_codepoint(codepoint)
		if not m_unicode.is_printable(codepoint) then
			local character = u(codepoint)
			local text = "<small>(unprintable)</small>"
			-- Link only when the character can form a page title.
			if new_title(character) then
				return "[[" .. character .. "|" .. text .. "]]"
			else
				return text
			end
		end
		
		local link_target = m_unicode.get_entry_title(codepoint)

		-- Combining characters are shown on a dotted circle (U+25CC).
		local display = ("%s&#%u;"):format(m_unicode.is_combining(codepoint) and "&#x25cc;" or "", codepoint)
		if m_unicode.is_whitespace(codepoint) then
			display = "]" .. display .. "["	
		end

		return
			(link_target and "[[:%s|<span class=\"character-sample %s\">%s</span>]]"
			or "<!-- %s --><span class=\"character-sample %s\">%s</span>"):format(
				link_target or "", char_to_script(codepoint), display
			)
	end
	
	local cps = get_data_for_code_point_range(range_start, range_end, is_assigned)
	
	local emoji_image_exists = false
	
	-- Image data modules are named after the codepoint divided by 0x1000, as
	-- three hex digits; the edit links point at the module for the start of the range.
	local submodule = math.floor(range_start / 0x1000)
	local image_module = ("Module:Unicode data/images/%03X"):format(submodule)
	local emoji_image_module = ("Module:Unicode data/emoji images/%03X"):format(submodule)
	if safe_require(emoji_image_module) then
		for _, data in ipairs(cps) do
			if data.image_emoji then
				emoji_image_exists = true
				break
			end
		end
	end
	
	insert(result, [=[
{| class="wikitable sortable"
! width="12%" data-sort-type="number" | Code point
]=]
	)
	if emoji_image_exists then
		insert(result, [=[
! width="5%"  | Text-style image<br><sup>[[Special:EditPage/]=] .. image_module .. [=[|edit]]</sup>
! width="5%"  | Emoji-style image<br><sup>[[Special:EditPage/]=] .. emoji_image_module .. [=[|edit]]</sup>
]=]
		)
	else
		insert(result, [=[
! width="5%"  | Image<br><sup>[[Special:EditPage/]=] .. image_module .. [=[|edit]]</sup>
]=]
		)
	end
	insert(result, [=[
! width="5%"  | Character
]=]
	)
	-- The General Category and Script columns are omitted when every row
	-- would show the same value.
	local all_with_same_general_category = Array(cps)
		:all(function(data) return data.category == cps[1].category end)
	local all_with_same_script = Array(cps)
		:all(function(data) return data.script == cps[1].script end)
	if not all_with_same_general_category then
		insert(result, " ! [[w:General Category|General<br />Category]]\n")
	end
	if not all_with_same_script then
		insert(result, " ! [[w:Script (Unicode)|Script]]\n")
	end
	insert(result, " ! Name\n")
	
	for _, data in ipairs(cps) do
		local alt_names = ""
		local cp = data.cp
		
		if data.aliases then
			-- Group the aliases by type; each alias entry is { type, name }.
			local aliases = {
				["correction"  ] = {},
				["control"     ] = {},
				["alternate"   ] = {},
				["figment"     ] = {},
				["abbreviation"] = {},
			}
			
			for _, info in ipairs(data.aliases) do
				insert(aliases[info[1]], "<small>" .. info[2] .. "</small>")
			end
			
			for _, name in ipairs(aliases.alternate) do
				alt_names = alt_names .. (" aka %s"):format(name)
			end

			if #aliases.control > 0 then
				alt_names = alt_names .. "; control character name: " .. concat(aliases.control, " or ")
			end

			for _, name in ipairs(aliases.correction) do
				alt_names = alt_names .. ("<br/>Corrected name: %s"):format(name)
			end
			
			for _, name in ipairs(aliases.figment) do
				alt_names = alt_names .. ("<br/>Figment name: %s"):format(name)
			end

			if #aliases.abbreviation > 0 then
				alt_names = alt_names .. " (" .. concat(aliases.abbreviation, ", ") .. ")"
			end
		end

		local current_image, current_image_emoji
		if data.image then
			current_image = ("[[File:%s|40x35px|class=skin-invert-image]]"):format(data.image)
		else
			current_image = ""
		end
		if emoji_image_exists then
			if data.image_emoji then
				current_image_emoji = ("[[File:%s|40x35px]]"):format(data.image_emoji)
			else
				current_image_emoji = ""
			end
		end
		
		-- Row start: anchor id "U-XXXX", decimal sort key, "U+XXXX (decimal)", image.
		insert(result, (
			" |- id=\"U-%04X\"\n" ..
			" | data-sort-value=\"%u\" | U+%04X <small>(%u)</small>\n" ..
			" | %s \n"
		):format(
			cp, cp, cp, cp,
			current_image
		))
		if emoji_image_exists then
			insert(result, (
				" | %s \n"
			):format(
				current_image_emoji
			))
		end
		insert(result, (
			" | %s \n"
		):format(
			present_codepoint(cp)
		))
		if not all_with_same_general_category then
			-- Parentheses keep only gsub's first result (the string).
			insert(result, (" | %s<br />(%s) \n")
				:format(
					data.category,
					(general_category_aliases[data.category]:gsub("_", " "))))
		end
		if not all_with_same_script then
			insert(result, (" | %s<br />(%s) \n")
				:format(
					data.script,
					script_data.aliases[data.script]))
		end
		insert(result, (" | <small>%s</small>%s\n")
			:format(
				mw.text.nowiki(data.name),
				alt_names))
			
	end
	
	insert(result,
		" |}"
	)
	
	insert(result, require("Module:TemplateStyles")("Template:character info/style.css"))

	return concat(result)
end
export.show = export.char_list_t

return export