Open main menu
Home
Random
Donate
Recent changes
Special pages
Community portal
Preferences
About Stockhub
Disclaimers
Search
User menu
Talk
Contributions
Create account
Log in
Editing
Module:Sandbox/Erutuon
Warning:
You are not logged in. Your IP address will be publicly visible if you make any edits. If you log in or create an account, your edits will be attributed to your username, along with other benefits.
Anti-spam check. Do not fill this in!
local p = {}

-- Generates the source text of a data module listing the code points that have
-- the Default_Ignorable_Code_Point property.
-- frame.args[1] is the title of a page holding the text of
-- DerivedCoreProperties.txt; a default user subpage is used when it is absent.
-- Returns the generated Lua source as a string.
function p.show(frame)
	local page = frame.args[1] or "User:Erutuon/Unicode/DerivedCoreProperties.txt"
	local title = assert(mw.title.new(page), "could not create title object")
	local text = assert(title:getContent(), "page has no content")
	
	-- Grab the lines of the Default_Ignorable_Code_Point section, from its
	-- first code point line up to the "# Total code points" summary line.
	local defaultIgnorable = assert(
		text:match("Derived Property: Default_Ignorable_Code_Point.-(%f[^\n]%x%x%x%x.-)%s*\n# Total code points"),
		"Default_Ignorable_Code_Point section not found")
	
	local singles, ranges = {}, {}
	-- Each data line starts with either "XXXX" or "XXXX..YYYY".
	for codePoint1, codePoint2 in defaultIgnorable:gmatch("%f[^\n%z](%x+)%.?%.?(%x*)") do
		-- tonumber("", 16) is nil, so codePoint2 is nil for a single code point.
		codePoint1, codePoint2 = tonumber(codePoint1, 16), tonumber(codePoint2, 16)
		local lastRange = ranges[#ranges]
		if lastRange and lastRange[2] == codePoint1 - 1 then
			-- Directly follows the previous range: extend that range.
			lastRange[2] = codePoint2 or codePoint1
		elseif not codePoint2 then
			singles[codePoint1] = true
		else
			table.insert(ranges, { codePoint1, codePoint2 })
		end
	end
	
	-- The first "..." is replaced with the singles, the second with the ranges.
	local template = [[
local data = {}

data.defaultIgnorable = {
	singles = {
...
	},
	ranges = {
...
	},
}

return data
]]
	
	local Array = require "Module:array"
	local printedRanges = Array()
	for _, range in ipairs(ranges) do
		local low, high = unpack(range)
		printedRanges:insert(('\t\t{ 0x%05X, 0x%05X },'):format(low, high))
	end
	
	local printedSingles = Array()
	for codepoint in require 'Module:TableTools'.sortedPairs(singles) do
		printedSingles:insert(('\t\t[0x%05X] = true,'):format(codepoint))
	end
	
	local data = template
		:gsub('%.%.%.', printedSingles:concat('\n'), 1)
		:gsub('%.%.%.', printedRanges:concat('\n'), 1)
	
	return data
end

local Unicode_data = require "Module:Unicode data/sandbox"
local fun = require "Module:fun"
local m_table = require "Module:TableTools"

-- Raises an error whose message is built with string.format.
-- If the first argument is a number, it is the error level (relative to the
-- caller of errorf) and the format string follows; otherwise the first
-- argument is the format string and the error is attributed to the caller.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end

-- Collects the language codes used in {{lang|code|...}} and {{lang-code|...}}
-- templates on a page.
-- frame.args[1] is the page title (default "English language").
-- Returns the codes joined by ", ", or nothing if the page cannot be read.
function p.search_for_language_codes(frame)
	local page_name = frame.args[1] or "English language"
	
	local success, title_object = pcall(mw.title.new, page_name)
	if not (success and title_object) then
		mw.log(("Could not make title object for '%s'."):format(page_name))
		return
	end
	
	-- getContent returns nil if the page does not exist.
	local content = title_object:getContent()
	if not content then
		mw.log(("Could not get content of '%s'."):format(page_name))
		return
	end
	
	local language_codes = {}
	for lang_template in content:gmatch "{{lang[^}]+" do
		local template_name = lang_template:match("{{([^|}]+)")
		local language_code
		if template_name == "lang" then
			language_code = lang_template:match "{{lang|([^|}]+)"
		elseif template_name:find "^lang-" then
			language_code = lang_template:match "{{lang-([^|}]+)"
		end
		
		if language_code then
			language_codes[language_code] = true
		end
	end
	
	return table.concat(m_table.keysToList(language_codes), ", ")
end

-- Metatable for the tables returned by p.parse_IETF.
local parsed_subtags_mt = {
	__index = {
		-- Records an error and finalizes the object.
		-- "error_key" is a key in "error_messages".
		-- "index" is the ordinal of the subtag in which the error was found.
		throw = function (self, error_key, index)
			self.error = self.error_messages[error_key]
			self.invalid = table.concat(self.input, "-", index)
			return self:remove_unnecessary_fields()
		end,
		
		-- Drops internal state, regularizes capitalization, and logs any
		-- validation problems.
		remove_unnecessary_fields = function (self)
			-- Only useful internally.
			self.input = nil
			self:pretty_print()
			p.validate_lang_tag(self)
			return self
		end,
		
		-- Regularize capitalization of language subtags:
		-- ZH-LATN -> zh-Latn, FR-ca -> fr-CA
		pretty_print = function (self)
			for key, func in pairs(self.print_funcs) do
				if self[key] then
					self[key] = func(self[key])
				end
			end
			return self
		end,
		
		-- Re-create the original tag from the parsed subtags.
		get_tag = function (self)
			if self.tag then
				return self.tag
			end
			
			local tag = {}
			for _, subtag_name in ipairs(self.subtag_order) do
				local value = self[subtag_name]
				if value ~= nil then
					-- The "x" singleton introduces the private-use subtags;
					-- it must only appear when there are any.
					if subtag_name == "private_use" then
						table.insert(tag, "x")
					end
					if type(value) == "table" then
						for _, subtag in ipairs(value) do
							table.insert(tag, subtag)
						end
					else
						table.insert(tag, value)
					end
				end
			end
			
			tag = table.concat(tag, "-")
			self.tag = tag -- Cache the result.
			return tag
		end,
		
		subtag_order = { "language", "script", "region", "variant", "private_use" },
		
		error_messages = {
			invalid_characters = "invalid characters",
			no_language = "no language subtag",
			invalid_subtag = "invalid subtag",
			invalid_private_use = "length of private-use subtag out of range",
			empty_private_use = "empty private-use subtag",
		}
	}
}

-- gsub callback: uppercase the first capture, lowercase the second.
local function initial_caps_helper(initial, rest)
	return string.upper(initial) .. string.lower(rest)
end

-- Lowercases a string, or every item if given a table of strings.
local function lower_or_map_lower(str)
	if type(str) == "table" then
		return fun.map(string.lower, str)
	else
		return string.lower(str)
	end
end

-- Capitalization function for each subtag type, used by pretty_print.
parsed_subtags_mt.__index.print_funcs = {
	language = string.lower,
	script = function (script_code)
		-- Parentheses discard gsub's second return value (the count).
		return (string.gsub(script_code, "^(%a)(%a%a%a)$", initial_caps_helper))
	end,
	region = string.upper,
	variant = lower_or_map_lower,
	private_use = lower_or_map_lower,
}

-- Calling parsed_subtags_mt(segments) creates a new parsed-subtags object.
setmetatable(parsed_subtags_mt, {
	__call = function (self, input)
		return setmetatable({ input = input }, self)
	end
})

-- An array of patterns for each subtag, and a "type" field for the name
-- of the subtag.
-- The patterns are checked in order, and any of the subtags can be skipped.
-- So, for example, the "language" subtag must precede the "script"
-- subtag, but a tag may contain a "language" subtag, no "script" subtag
-- and then a "region" subtag.
-- If the full list of subtags has been iterated over, the remaining subtags
-- must match the pattern for a private-use subtag, or the tag is invalid.
local subtag_info = { -- can be put in data module
	{ "%a%a%a?", "1%a+", type = "language" },	-- ll or lll; special case
	-- include extlang?
	{ "%a%a%a%a", type = "script" },			-- Ssss
	{ "%a%a", "%d%d%d", type = "region" },		-- rr, DDD
	{
		"%d%d%d%d",				-- 4 digits
		"%w%w%w%w%w%w?%w?%w?",	-- 5-8 alnum characters
		type = "variant",
		repeatable = true,		-- There can be multiple variants.
	}
}

-- A previous draft, in [[Module:Lang/sandbox]]:
-- https://en.wikipedia.org/w/index.php?oldid=812819217
-- Based on https://www.w3.org/International/articles/language-tags/.
-- Parse a language tag.
-- Returns nil if tag is not a string or empty.
-- Else returns a table with a map of subtag type to subtag for all subtags that
-- were parsed.
-- If there was an error, returns an "error" field with a description of the
-- error, and an "invalid" field with the suffix of the tag starting at the
-- index where the error occurred.
-- Does not recognize "extension" tags, such as those introduced by "u", as they
-- are not needed on Wikipedia. Does not recognize "grandfathered" tags.
-- Does not recognize extended language subtags, such as "zh-yue".
-- https://www.rfc-editor.org/rfc/rfc6067.txt, https://tools.ietf.org/html/bcp47
-- Only checks that the syntax is correct, not that the values are valid. For
-- instance, will accept non-existent language codes, like "zz".
function p.parse_IETF(tag)
	if type(tag) ~= "string" or tag == "" then
		return nil
	end
	
	-- This may contain the special fields "invalid", "error".
	-- "error" indicates why the tag is invalid (if applicable).
	-- All other fields are subtags, and they appear in the tag in the following
	-- order:
	-- "language", "script", "region", "variant", "private_use", "invalid"
	-- All these subtags can be strings or nil, while "variant" can also be an
	-- array of strings if more than one variant subtag was found.
	-- "invalid" is the portion of the tag after the last valid subtag (minus a
	-- hyphen).
	local segments = mw.text.split(tag, "-")
	local parsed_subtags = parsed_subtags_mt(segments)
	
	-- Language tags probably only contain ASCII alphabetic and numerical
	-- characters and hyphen-minus.
	if not tag:find "^[A-Za-z0-9-]+$" then
		return parsed_subtags:throw(
			"invalid_characters",
			fun.indexOf(
				function (segment)
					return segment:find "[^A-Za-z0-9-]"
				end,
				segments))
	end
	
	local subtag_i = 1	-- Index of current item in subtag_info.
	local segment_i = 1	-- Index of current segment.
	while segments[segment_i] and subtag_info[subtag_i] do
		local segment = segments[segment_i]
		local subtag_type
		
		while not subtag_type and subtag_info[subtag_i] do
			-- Check each pattern for the subtag type at "subtag_i" in "subtag_info".
			local cur_subtag = subtag_info[subtag_i]
			for _, pattern in ipairs(cur_subtag) do
				if segment:find("^" .. pattern .. "$") then
					subtag_type = cur_subtag.type
					-- There can be multiple "variant" subtags (and "extension"
					-- subtags, if those are added).
					if not cur_subtag.repeatable then
						subtag_i = subtag_i + 1
					end
					break
				end
			end
			
			if not subtag_type then
				-- No match; try next subtag.
				subtag_i = subtag_i + 1
			end
		end
		
		-- If language subtag has not been found, or the current segment has not
		-- been matched as a subtag, break the loop and check for
		-- a private-use subtag.
		if segment_i == 1 and subtag_type ~= "language" or not subtag_type then
			break
		end
		
		local existing = parsed_subtags[subtag_type]
		if existing then
			-- Create an array.
			if type(existing) == "string" then
				existing = { existing }
				parsed_subtags[subtag_type] = existing
			end -- else table
			table.insert(existing, segment)
		else
			parsed_subtags[subtag_type] = segment
		end
		
		segment_i = segment_i + 1
	end
	
	if segments[segment_i] then -- More segments to scan?
		-- Not all potential subtags were matched. Check for private-use subtags.
		-- https://tools.ietf.org/html/bcp47#section-2.2.7
		-- Private-use subtags consist of one or more sequences of 1 to 8
		-- alphanumeric characters preceded by "x-".
		-- Alphanumericity has already been checked.
		-- A tag must start with either a language subtag or a private-use subtag.
		
		-- If next segment is not "x", introducing a private-use subtag, there
		-- is no private-use subtag.
		if segments[segment_i]:lower() ~= "x" then
			if not parsed_subtags.language then
				return parsed_subtags:throw("no_language", 1)
			else
				return parsed_subtags:throw("invalid_subtag", segment_i)
			end
		elseif not segments[segment_i + 1] then
			return parsed_subtags:throw("empty_private_use", segment_i)
		end
		
		-- Check length of all segments after "x".
		for i = segment_i + 1, #segments do
			local length = #segments[i]
			if not (1 <= length and length <= 8) then
				return parsed_subtags
					:throw("invalid_private_use", segment_i)
			end
		end
		
		-- segment_i is the index of "x". This is computed from segment_i
		-- rather than from the index of the last matched segment, which does
		-- not exist when the tag consists only of private-use subtags.
		if not segments[segment_i + 2] then
			-- There is only one private-use subtag.
			parsed_subtags.private_use = segments[segment_i + 1]
		else
			parsed_subtags.private_use = {}
			for i = segment_i + 1, #segments do
				table.insert(parsed_subtags.private_use, segments[i])
			end
		end
	end
	
	return parsed_subtags:remove_unnecessary_fields()
end

local lang_name_table = mw.loadData "Module:Language/name/data"
local synonym_table = mw.loadData "Module:Lang/ISO 639 synonyms"
local lang_data = mw.loadData "Module:Lang/data"

-- Checks the subtags of a parsed tag against the data modules and writes a
-- message to the debug log for each problem found. Returns nothing and does
-- not modify the subtags (apart from get_tag caching its result).
function p.validate_lang_tag(parsed_subtags)
	-- Already checked that the tag starts with a language subtag or a private-use subtag.
	
	-- Script code is initially capitalized, region code is uppercase,
	-- everything else is lowercase.
	
	-- Check existence of language tag.
	if parsed_subtags.language
			and not (lang_data.override[parsed_subtags.language]
			or lang_name_table.lang[parsed_subtags.language]) then
		mw.log("Invalid language code", parsed_subtags.language, "in", parsed_subtags:get_tag())
	end
	
	-- Check existence of script tag.
	if parsed_subtags.script then
		local lower_script = parsed_subtags.script:lower()
		if not lang_name_table.script[lower_script] then
			mw.log("Invalid script code", parsed_subtags.script, "in", parsed_subtags:get_tag())
		end
		
		-- Check that script tag is not marked as superfluous (because it is
		-- considered the default one for the language).
		if lang_name_table.suppressed[lower_script]
				and parsed_subtags.language
				and m_table.inArray(
					lang_name_table.suppressed[lower_script],
					parsed_subtags.language:lower()) then
			mw.log(parsed_subtags.script, "is suppressed with", parsed_subtags.language,
				"in", parsed_subtags:get_tag())
		end
	end
	
	-- Check existence of region code.
	if parsed_subtags.region
			and not lang_name_table.region[parsed_subtags.region:lower()] then
		mw.log("Invalid region code", parsed_subtags.region, "in", parsed_subtags:get_tag())
	end
	
	-- Check that variant code is valid, and that it can validly be used with the
	-- given combination of language, script, region, and variant.
	-- Check for duplicate variant subtags?
	if parsed_subtags.variant then
		local lower_tag = parsed_subtags:get_tag():lower()
		local variants = parsed_subtags.variant
		if type(variants) ~= "table" then
			variants = { variants }
		end
		
		for _, variant in ipairs(variants) do
			local variant_data = lang_name_table.variant[variant]
			if not variant_data then
				mw.log("Invalid variant code", variant, "in", parsed_subtags:get_tag())
			else
				-- The part of the tag that precedes this variant subtag.
				local prefix = lower_tag:match("^(.-)%-" .. variant)
				
				-- Check that at least one of the prefixes is found at the
				-- beginning of lower_tag.
				local has_valid_prefix = fun.some(
					function (candidate)
						return lower_tag:find(candidate, 1, true) == 1
					end,
					variant_data.prefixes)
				if not has_valid_prefix then
					mw.log("Variant tag", variant, "does not belong with prefix", prefix,
						"in", parsed_subtags:get_tag())
				end
			end
		end
	end
	
	-- Check that the private-use subtag is actually used by Wikipedia.
	-- get_tag() is called rather than reading the "tag" field directly,
	-- because that field is only set once get_tag() has been called.
	if parsed_subtags.private_use
			and not lang_data.override[parsed_subtags:get_tag()] then
		mw.log("Invalid private-use subtag in", parsed_subtags:get_tag())
	end
end

-- Displays the key-value pairs encoded in the COinS metadata of a citation.
-- frame.args[1] is the citation HTML. Returns that HTML followed by a blank
-- line and a ", "-separated list of "key: value" pairs sorted by key.
-- Returns the citation unchanged if no COinS span with a title is found.
function p.show_COinS(frame)
	local ref = frame.args[1]
	local tag = ref:match('<span [^>]*class="Z3988"[^>]*>')
	local data = tag and tag:match('title="(.-)"')
	if not data then
		return ref
	end
	
	local vals = {}
	for item in mw.text.gsplit(data, "&") do
		local key, value = item:match("(.-)=(.*)")
		-- Skip items that are not of the form key=value.
		if key then
			vals[key] = mw.uri.decode(value)
		end
	end
	
	return ref .. "\n\n" .. table.concat(
		fun.mapIter(
			function (value, key)
				return ("%s: %s"):format(key, value)
			end,
			m_table.sortedPairs(vals)),
		", ")
end

return p
Summary:
Please note that all contributions to Stockhub may be edited, altered, or removed by other contributors. If you do not want your writing to be edited mercilessly, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see Stockhub:Copyrights for details).
Do not submit copyrighted work without permission!
Cancel
Editing help
(opens in new window)
Templates used on this page:
Module:Fun
(
edit
)
Module:Language/name/data
(
edit
)
Module:Sandbox/Erutuon
(
edit
)
Module:Sandbox/Erutuon/doc
(
edit
)
Module:Sandbox/Erutuon/testcases
(
edit
)
Module:TableTools
(
edit
)
Module:Unicode data/sandbox
(
edit
)
Module:UnitTests
(
edit
)