-- Editing Module:Sandbox/Erutuon

local p = {}

-- Generate the source text of a data module listing the code points in the
-- "Default_Ignorable_Code_Point" section of a DerivedCoreProperties.txt copy.
--
-- frame.args[1]: title of the page containing the text; defaults to
--   "User:Erutuon/Unicode/DerivedCoreProperties.txt".
-- Returns: a string of Lua source in which "singles" maps individual code
--   points to true and "ranges" lists { low, high } pairs.
-- Raises an error if the page cannot be read or the section is not found.
function p.show(frame)
	local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt"
	
	local title = mw.title.new(page)
	if not title then
		error(("Could not make title object for '%s'."):format(page))
	end
	local text = title:getContent()
	if not text then
		error(("Could not get content of '%s'."):format(page))
	end
	
	local defaultIgnorable = text
		:match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s*\n# Total code points")
	if not defaultIgnorable then
		error(("Default_Ignorable_Code_Point section not found in '%s'."):format(page))
	end
	
	local singles, ranges = {}, {}
	-- Each data line starts with "XXXX" or "XXXX..YYYY" (hexadecimal).
	for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x*)") do
		-- tonumber("", 16) is nil, so codePoint2 is nil for a single code point.
		codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16)
		local lastRange = ranges[#ranges]
		if lastRange and lastRange[2] == codePoint1 - 1 then
			-- Directly follows the previous range: extend that range.
			lastRange[2] = codePoint2 or codePoint1
		elseif not codePoint2 then
			singles[codePoint1] = true
		else
			table.insert(ranges, { codePoint1, codePoint2 })
		end
	end
	
	-- The two "..." placeholders are replaced, in order, by the printed
	-- singles and the printed ranges.
	local template = [[
local data = {}

data.defaultIgnorable = {
	singles = {
...
	},
	
	ranges = {
...
	},
}

return data
]]

	local Array = require "Module:array"
	local printedRanges = Array()
	for _, range in ipairs(ranges) do
		printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(range[1], range[2]))
	end
	
	local printedSingles = Array()
	for codepoint in require 'Module:TableTools'.sortedPairs(singles) do
		printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint))
	end
	
	-- The printed lines contain no "%", so they are safe as gsub replacement
	-- strings. Assigning to a single local drops gsub's match count.
	local withSingles = template:gsub('%.%.%.', printedSingles:concat('\n'), 1)
	local data = withSingles:gsub('%.%.%.', printedRanges:concat('\n'), 1)
	
	return data
end

local Unicode_data = require "Module:Unicode data/sandbox"
local fun = require "Module:fun"
local m_table = require "Module:TableTools"

-- Raise an error whose message is built with string.format.
-- Call as errorf(level, fmt, ...) to choose the stack level (relative to the
-- caller of errorf), or as errorf(fmt, ...) to blame the direct caller.
local function errorf(level, ...)
	local message, depth
	if type(level) ~= "number" then
		-- No level given: the first argument is the format string.
		message, depth = string.format(level, ...), 2
	else
		-- Add one so that the level is counted from errorf's caller.
		message, depth = string.format(...), level + 1
	end
	return error(message, depth)
end

-- List the language codes used in {{lang|...}} and {{lang-...}} templates
-- in the wikitext of a page.
--
-- frame.args[1]: page name; defaults to "English language".
-- Returns: the distinct codes joined by ", ", or nothing (after logging a
--   message) if the title is invalid or the page has no content.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"
	
	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		-- The mw library has no logf function; format the message first.
		mw.log(string.format("Could not make title object for '%s'.", page_name))
		return
	end
	
	local content = title_object:getContent()
	if not content then
		mw.log(string.format("Could not get content of '%s'.", page_name))
		return
	end
	
	local language_codes = {}
	for lang_template in content:gmatch "{{lang[^}]+" do
		-- The match starts with "{{lang", so a template name is always found.
		local template_name = lang_template:match("{{([^|}]+)")
		local language_code
		if template_name == "lang" then
			-- {{lang|code|...}}: the code is the first parameter.
			language_code = lang_template:match "{{lang|([^|}]+)"
		elseif template_name:find "^lang-" then
			-- {{lang-code|...}}: the code is part of the template name.
			language_code = lang_template:match "{{lang-([^|}]+)"
		end
		if language_code then
			-- Used as a set so that each code is listed once.
			language_codes[language_code] = true
		end
	end
	
	return table.concat(m_table.keysToList(language_codes), ", ")
end

-- Metatable for the tables of parsed subtags returned by p.parse_IETF.
-- Methods and shared data live in __index.
local parsed_subtags_mt = {
	__index = {
		-- Record an error on the object and finish it.
		-- "error_key" is a key of error_messages.
		-- "index" is the ordinal of the subtag in which the error was found;
		-- the "invalid" field is set to the input segments from that index on,
		-- joined by hyphens.
		throw = function (self, error_key, index)
			self.error = self.error_messages[error_key]
			self.invalid = table.concat(self.input, "-", index)
			return self:remove_unnecessary_fields()
		end,
		
		-- Drop the internal "input" field, normalize capitalization, run
		-- validation (which only logs), and return the object.
		remove_unnecessary_fields = function (self)
			-- Only useful internally.
			self.input = nil
			self:pretty_print()
			p.validate_lang_tag(self)
			return self
		end,
		
		-- Regularize capitalization of language subtags:
		-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA
		pretty_print = function (self)
			for key, func in pairs(self.print_funcs) do
				if self[key] then
					self[key] = func(self[key])
				end
			end
			return self
		end,
		
		-- Re-create the tag from the parsed subtags, in subtag_order.
		-- The result is cached in the "tag" field.
		get_tag = function (self)
			if self.tag then return self.tag end
			
			local tag = {}
			for _, subtag_name in ipairs(self.subtag_order) do
				local value = self[subtag_name]
				if value then
					-- The "x" singleton only belongs in the tag when a
					-- private-use subtag is actually present.
					if subtag_name == "private_use" then
						table.insert(tag, "x")
					end
					
					if type(value) == "table" then
						for _, subtag in ipairs(value) do
							table.insert(tag, subtag)
						end
					else
						table.insert(tag, value)
					end
				end
			end
			
			tag = table.concat(tag, "-")
			self.tag = tag -- Cache the result.
			
			return tag
		end,
		
		-- Order in which subtags appear in a tag.
		subtag_order = {
			"language", "script", "region", "variant", "private_use"
		},
		
		-- Error key -> message stored in the "error" field by throw.
		error_messages = {
			invalid_characters = "invalid characters",
			no_language = "no language subtag",
			invalid_subtag = "invalid subtag",
			invalid_private_use = "length of private-use subtag out of range",
			empty_private_use = "empty private-use subtag",
		}
	}
}
-- Join an uppercased first part and a lowercased remainder ("l", "ATN" -> "Latn").
local function initial_caps_helper(initial, rest)
	local head = string.upper(initial)
	local tail = string.lower(rest)
	return head .. tail
end
-- Lowercase a string, or apply string.lower to a table through fun.map.
local function lower_or_map_lower(str)
	if type(str) ~= "table" then
		return string.lower(str)
	end
	return fun.map(string.lower, str)
end
-- Subtag name -> function that normalizes the capitalization of that subtag.
-- Used by the pretty_print method.
do
	local print_funcs = {}
	
	print_funcs.language = string.lower
	
	-- Capitalize only the first letter of a four-letter script code.
	function print_funcs.script(script_code)
		-- Assigning to one local drops gsub's second result (the match count).
		local capitalized = string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper)
		return capitalized
	end
	
	print_funcs.region = string.upper
	print_funcs.variant = lower_or_map_lower
	print_funcs.private_use = lower_or_map_lower
	
	parsed_subtags_mt.__index.print_funcs = print_funcs
end

-- Make parsed_subtags_mt callable: parsed_subtags_mt(input) creates a new
-- object holding "input" and using parsed_subtags_mt as its metatable.
setmetatable(parsed_subtags_mt, {
	__call = function (mt, input)
		local instance = { input = input }
		return setmetatable(instance, mt)
	end
})
	
-- An array of patterns for each subtag, and a "type" field for the name
-- of the subtag.
-- The patterns are checked in order, and any of the subtags can be skipped.
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
-- Each entry: array part = Lua patterns (anchored with "^" and "$" by
-- p.parse_IETF before matching), "type" = name of the field the matching
-- segment is stored under, "repeatable" = the same entry may match several
-- consecutive segments.
local subtag_info = { -- can be put in data module
	-- NOTE(review): the purpose of the "1%a+" special case is not evident
	-- from this file; confirm which tags it is meant to accept.
	{ "%a%a%a?", "1%a+", type = "language" }, -- ll or lll; special case
	-- include extlang?
	{ "%a%a%a%a", type = "script" }, -- Ssss
	{ "%a%a", "%d%d%d", type = "region" }, -- rr, DDD
	{
		"%d%d%d%d", -- 4 digits
		"%w%w%w%w%w%w?%w?%w?", -- 5-8 alnum characters
		type = "variant",
		repeatable = true, -- There can be multiple variants.
	}
}

-- A previous draft, in [[Module:Lang/sandbox]]:
-- https://en.wikipedia.org/w/index.php?oldid=812819217

-- Based on https://www.w3.org/International/articles/language-tags/.

-- Parse a language tag.
-- Returns nil if tag is not a string or empty.
-- Else returns a table with a map of subtag type to subtag for all subtags that
-- were parsed.
-- If there was an error, returns an "error" field with a description of the
-- error, and an "invalid" field with the suffix of the tag starting at the
-- index where the error occurred.

-- Does not recognize "extension" tags, such as those introduced by "u", as they
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags.
-- Does not recognize extended language subtags, such as "zh-yue".
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47

-- Only checks that the syntax is correct, not that the values are valid. For
-- instance, will accept non-existent language codes, like "zz".
-- Parse an IETF language tag into its subtags (syntax check only).
--
-- tag: the language tag.
-- Returns: nil if tag is not a string or is empty. Otherwise a table with the
--   fields "language", "script", "region", "variant", "private_use" for the
--   subtags that were found ("variant" and "private_use" are arrays when
--   there is more than one), and, on error, the fields "error" (description)
--   and "invalid" (the rest of the tag from the offending segment on).
function p.parse_IETF(tag)
	if type(tag) ~= "string" or tag == ""  then
		return nil
	end
	
	-- This may contain the special fields "invalid", "error".
	-- "error" indicates why the
	-- tag is invalid (if applicable).
	-- All other fields are subtags, and they appear in the tag in the following
	-- order:
	-- "language", "script", "region", "variant", "private_use", "invalid"
	-- All these subtags can be strings or nil, while "variant" can also be an
	-- array of strings if more than one variant subtag was found.
	-- "invalid" is the portion of the tag after the last valid subtag (minus a
	-- hyphen).
	-- Plain-text split: the hyphen is a literal separator, not a pattern.
	local segments = mw.text.split(tag, "-", true)
	local parsed_subtags = parsed_subtags_mt(segments)
	
	-- Language tags probably only contain ASCII alphabetic and numerical
	-- characters and hyphen-minus.
	if not tag:find "^[A-Za-z0-9-]+$" then
		return parsed_subtags:throw(
			"invalid_characters",
			fun.indexOf(
				function (segment)
					return segment:find "[^A-Za-z0-9-]"
				end,
				segments))
	end
	
	local subtag_i = 1 -- Index of current item in subtag_info.
	local segment_i = 1 -- Index of current segment.
	while segments[segment_i] and subtag_info[subtag_i] do
		local segment = segments[segment_i]
		local subtag_type
		while not subtag_type and subtag_info[subtag_i] do
			-- Check each pattern for the subtag type at "subtag_i" in "subtag_info".
			local cur_subtag = subtag_info[subtag_i]
			for _, pattern in ipairs(cur_subtag) do
				if segment:find("^" .. pattern .. "$") then
					subtag_type = cur_subtag.type
					-- There can be multiple "variant" subtags (and "extension"
					-- subtags, if those are added).
					if not cur_subtag.repeatable then
						subtag_i = subtag_i + 1
					end
					break
				end
			end
			
			if not subtag_type then -- No match; try next subtag.
				subtag_i = subtag_i + 1
			end
		end
		
		-- If language subtag has not been found, or the current segment has not
		-- been matched as a subtag, break the loop and check for
		-- a private-use subtag.
		if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
			break
		end
		
		local previous = parsed_subtags[subtag_type]
		if previous then -- Create an array.
			if type(previous) == "string" then
				previous = { previous }
				parsed_subtags[subtag_type] = previous
			end -- else table
			table.insert(previous, segment)
		else
			parsed_subtags[subtag_type] = segment
		end
		
		segment_i = segment_i + 1
	end
	
	-- Here segment_i is the index of the first segment that was not stored as
	-- a subtag (1 if nothing was matched).
	if segments[segment_i] then -- More segments to scan?
		-- Not all potential subtags were matched. Check for private-use subtags.
		-- https://tools.ietf.org/html/bcp47#section-2.2.7
		-- Private-use subtags consist of one or more sequences of 1 to 8
		-- alphanumeric characters preceded by "x-".
		-- Alphanumericity has already been checked.
		
		-- A tag must start with either a language subtag or a private-use subtag.
		-- If next segment is not "x", introducing a private-use subtag, there
		-- is no private-use subtag.
		if segments[segment_i]:lower() ~= "x" then
			if not parsed_subtags.language then
				return parsed_subtags:throw("no_language", 1)
			else
				return parsed_subtags:throw("invalid_subtag",
					segment_i)
			end
		elseif not segments[segment_i + 1] then
			return parsed_subtags:throw("empty_private_use",
				segment_i)
		end
		
		-- Check length of all segments after "x".
		for i = segment_i + 1, #segments do
			local length = #segments[i]
			
			if not (1 <= length and length <= 8) then
				return parsed_subtags
					:throw("invalid_private_use", segment_i)
			end
		end
		
		-- segments[segment_i] is "x" and segments[segment_i + 1] is the first
		-- private-use subtag, so a segment at segment_i + 2 means that there
		-- is more than one. Computed from segment_i so that tags starting with
		-- "x" (where no segment was matched before) are handled as well.
		if not segments[segment_i + 2] then -- There is only one private-use subtag.
			parsed_subtags.private_use = segments[segment_i + 1]
		else
			parsed_subtags.private_use = {}
			for i = segment_i + 1, #segments do
				table.insert(parsed_subtags.private_use, segments[i])
			end
		end
	end
	
	return parsed_subtags:remove_unnecessary_fields()
end


-- Data tables loaded with mw.loadData.
-- lang_name_table and lang_data are read by p.validate_lang_tag;
-- synonym_table is not referenced elsewhere in this file.
local lang_name_table = mw.loadData "Module:Language/name/data"
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms"
local lang_data =  mw.loadData "Module:Lang/data"

-- Check the subtags of a parsed tag against the loaded data tables and write
-- a message to the debug log (mw.log) for each problem found.
--
-- parsed_subtags: an object created by p.parse_IETF (it must provide the
--   get_tag method).
-- Returns nothing.
function p.validate_lang_tag(parsed_subtags)
	-- Already checked that the tag starts with a language subtag or a private-use subtag.
	-- Script code is initially capitalized, region code is uppercase,
	-- everything else is lowercase.
	
	-- Check existence of language tag.
	if parsed_subtags.language and
			not (lang_data.override[parsed_subtags.language]
			or lang_name_table.lang[parsed_subtags.language]) then
		mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag())
	end
	
	-- Check existence of script tag.
	if parsed_subtags.script then
		local lower_script = parsed_subtags.script:lower()
		if not lang_name_table.script[lower_script] then
			mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag())
		end
		
		-- Check that script tag is not marked as superfluous (because it is
		-- considered the default one for the language).
		local suppressed_for = lang_name_table.suppressed[lower_script]
		if suppressed_for
				and parsed_subtags.language
				and m_table.inArray(
					suppressed_for,
					parsed_subtags.language:lower()) then
			mw.log(parsed_subtags.script, "is suppressed with",
				parsed_subtags.language, "in", parsed_subtags:get_tag())
		end
	end
	
	-- Check existence of region code.
	if parsed_subtags.region and not lang_name_table.region[parsed_subtags.region:lower()] then
		mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag())
	end
	
	-- Check that variant code is valid, and that it can validly be used with the
	-- given combination of language, script, region, and variant.
	-- Check for duplicate variant subtags?
	if parsed_subtags.variant then
		local lower_tag = parsed_subtags:get_tag():lower()
		
		-- Treat a single variant like a one-item array.
		local variants = parsed_subtags.variant
		if type(variants) ~= "table" then
			variants = { variants }
		end
		
		for _, variant in ipairs(variants) do
			local variant_data = lang_name_table.variant[variant]
			if not variant_data then
				mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag())
			else
				-- Part of the tag before this variant; only used in the log
				-- message.
				local prefix = lower_tag:match("^(.-)%-" .. variant)
				
				-- Check that at least one of the prefixes is found at the
				-- beginning of lower_tag.
				if not fun.some(function (allowed_prefix)
							return lower_tag:find(allowed_prefix, 1, true) == 1
						end,
						variant_data.prefixes) then
					mw.log("Variant tag", variant, "does not belong with prefix",
						prefix, "in", parsed_subtags:get_tag())
				end
			end
		end
	end
	
	-- Check that the private-use subtag is actually used by Wikipedia.
	-- Use get_tag() rather than the "tag" field: the field is only a cache
	-- and is nil until get_tag() has been called.
	if parsed_subtags.private_use
			and not lang_data.override[parsed_subtags:get_tag()] then
		mw.log("Invalid private-use subtag in", parsed_subtags:get_tag())
	end
end

-- Show the key-value pairs stored in the COinS metadata of a citation.
--
-- frame.args[1]: text containing a <span> with class="Z3988" whose title
--   attribute holds "&"-separated key=value pairs.
-- Returns: the input, a blank line, and the decoded pairs as "key: value"
--   sorted by key and joined by ", ". If no argument is given, returns the
--   empty string; if no such span or title is found, returns the input
--   unchanged.
function p.show_COinS(frame)
	local ref = frame.args[1]
	if not ref then
		return ""
	end
	
	local tag = ref:match('<span [^>]*class="Z3988"[^>]*>')
	local data = tag and tag:match('title="(.-)"')
	if not data then
		return ref
	end
	
	local vals = {}
	
	for item in mw.text.gsplit(data, "&") do
		local key, value = item:match("(.-)=(.*)")
		-- Skip items without "=": there is nothing to decode or to key by.
		if key then
			vals[key] = mw.uri.decode(value)
		end
	end
	
	local printed = fun.mapIter(
		function (value, key)
			return ("%s: %s"):format(key, value)
		end,
		m_table.sortedPairs(vals))
	
	return ref .. "\n\n" .. table.concat(printed, ", ")
end

return p