-- Documentation for this module may be created at Module:Sandbox/trappist the monk/test/doc
--[[
this code is a test of an alternate method for dissecting IETF language tags. Because this is a hack, it is only
meant to be run from the debug console. To do that, in the console window below, type: =p.get_ietf_parts({args = {'<tag>'}})
]]
local p = {};																	-- module export table; declared local so that loading this module does not create a global
--[[--------------------------< P A R S E _ I E T F >----------------------------------------------------------

tries each Lua pattern in <pattern_table>, in order, against <source> and stops at the first pattern that matches.

returns the first four values produced by that match (when the pattern has fewer than four captures the remaining
values are nil).  When no pattern matches, returns nothing so every variable that the caller assigns from this
call is nil.

]]

local function parse_ietf (source, pattern_table)
	for _, ietf_pattern in ipairs (pattern_table) do
		local first, second, third, fourth = source:match (ietf_pattern);
		if first then															-- first value is nil only when the pattern did not match
			return first, second, third, fourth;
		end
	end
end
--[[--------------------------< G E T _ I E T F _ P A R T S >--------------------------------------------------

extracts and returns IETF language tag parts:
	primary language subtag (required) - 2 or 3 character IANA language code
	script subtag - four character IANA script code
	region subtag - two-letter or three digit IANA region code
	variant subtag - four digit or 5-8 alnum variant code
	private subtag - x- followed by 1-8 alnum private code; only supported with the primary language tag

in any one of these forms
	lang					lang-variant
	lang-script				lang-script-variant
	lang-region				lang-region-variant
	lang-script-region		lang-script-region-variant
	lang-x-private

each of lang, script, region, variant, and private, when used, must be valid

NOTE: this test version checks only the shape of each subtag (letter / digit counts); it does not look any subtag
up in a registry and it does not do the two-character promotion described in the next paragraph.

Languages with both two- and three-character code synonyms are promoted to the two-character synonym because
the IANA registry file omits the synonymous three-character code; we cannot depend on browsers understanding
the synonymous three-character codes in the lang= attribute.

For {{lang-xx}} templates, the parameters |script=, |region=, and |variant= are supported (not supported in {{lang}}
because those parameters are superfluous to the IETF subtags in |code=)

<frame> is either a frame-like table that holds the tag in frame.args[1] or the tag itself as a string (the string
form is what lets =p.get_ietf_parts('<tag>') work from the debug console).

intended (non-debug) return: six values.  Valid parts are returned as themselves; omitted parts are returned as
empty strings, invalid parts are returned as nil; the sixth returned item is an error message (if an error
detected) or nil.

this debug version returns instead:
	tag matched:				one string; code, script, region, variant, private joined with ', ' (omitted parts are empty strings)
	more than three hyphens:	one string; 'unrecognized language tag: ' followed by the tag
	no pattern matched:			nil, nil, nil, nil, nil, 'unrecognized language tag: ' followed by the tag
	no tag supplied:			nil, nil, nil, nil, nil, 'missing language tag'

see http://www.rfc-editor.org/rfc/bcp/bcp47.txt section 2.1

]]

function p.get_ietf_parts (frame)
	local code;
	local script;
	local region;
	local variant;
	local private;

	local source;																-- debug: the tag to dissect
	if 'table' == type (frame) then
		source = frame.args and frame.args[1];									-- frame or frame-like table
	else
		source = frame;															-- bare tag string from the debug console
	end

	if 'string' ~= type (source) then											-- nothing usable to parse; bail out before the string methods below can raise an error
		return nil, nil, nil, nil, nil, 'missing language tag';
	end

	local csrv_patterns = {														-- code, script, region, variant patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)%-(%d%d%d%d)$',							-- cc-Ssss-RR-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)%-(%d%d%d%d)$',						-- cc-Ssss-DDD-variant (where region is 3 digits; variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',		-- cc-Ssss-RR-variant (where variant is 5-8 alnum characters)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-Ssss-DDD-variant (where region is 3 digits; variant is 5-8 alnum characters)
		}

	local crv_patterns = {														-- code, region, variant patterns
		'^(%a%a%a?)%-(%a%a)%-(%d%d%d%d)$',										-- cc-RR-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%d%d%d)%-(%d%d%d%d)$',									-- cc-DDD-variant (where region is 3 digits; variant is 4 digits)
		'^(%a%a%a?)%-(%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',		-- cc-RR-variant (where variant is 5-8 alnum characters)
		'^(%a%a%a?)%-(%d%d%d)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-DDD-variant (where region is 3 digits; variant is 5-8 alnum characters)
		}

	local csv_patterns = {														-- code, script, variant patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d%d)$',									-- cc-Ssss-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-Ssss-variant (where variant is 5-8 alnum characters)
		}

	local csr_patterns = {														-- code, script, region patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)$',										-- cc-Ssss-RR
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)$',									-- cc-Ssss-DDD (where region is 3 digits)
		}

	local cv_patterns = {														-- code, variant patterns
		'^(%a%a%a?)%-(%d%d%d%d)$',												-- cc-variant (where variant is 4 digits)
		'^(%a%a%a?)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-variant (where variant is 5-8 alnum characters)
		}

	local cr_patterns = {														-- code, region patterns
		'^(%a%a%a?)%-(%a%a)$',													-- cc-RR
		'^(%a%a%a?)%-(%d%d%d)$',												-- cc-DDD (region is 3 digits)
		}

	local cs_patterns = {														-- code, script patterns
		'^(%a%a%a?)%-(%a%a%a%a)$',												-- cc-Ssss
		}

	local cp_patterns = {														-- code, private-use patterns
		'^(%a%a%a?)%-x%-([%a%d][%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?)$'	-- cc-x-pppppppp where private is 1-8 alnum characters
		}

	local c_patterns = {														-- code-only pattern
		'^(%a%a%a?)$',															-- cc
		}

	local _, element_count = source:gsub ('%-', '');							-- number of hyphens selects which pattern groups can possibly match

	if 0 == element_count then
		code = source:match (c_patterns[1]);									-- done this way because there is only one pattern

	elseif 1 == element_count then												-- try script, then region, then variant; a failed attempt leaves code nil so the next one runs
		code, script = source:match (cs_patterns[1]);							-- done this way because there is only one pattern
		if not code then
			code, region = parse_ietf (source, cr_patterns);
		end
		if not code then
			code, variant = parse_ietf (source, cv_patterns);
		end

	elseif 2 == element_count then												-- same fall-through scheme as above
		code, script, region = parse_ietf (source, csr_patterns);
		if not code then
			code, script, variant = parse_ietf (source, csv_patterns);
		end
		if not code then
			code, region, variant = parse_ietf (source, crv_patterns);
		end
		if not code then
			code, private = source:match (cp_patterns[1]);						-- the -x- counts as an element but we don't return that; done this way because there is only one pattern
		end

	elseif 3 == element_count then
		code, script, region, variant = parse_ietf (source, csrv_patterns);

	else
		return table.concat ({'unrecognized language tag: ', source});			-- debug return
--		return nil, nil, nil, nil, nil, table.concat ({'unrecognized language tag: ', source});	-- don't know what we got but it is malformed - too many hyphens
	end

	if not code then
		return nil, nil, nil, nil, nil, table.concat ({'unrecognized language tag: ', source});		-- don't know what we got but it is malformed
	end

--	return code, script, region, variant, private;								-- debug return
	return table.concat ({code, script or '', region or '', variant or '', private or ''}, ', ');	-- debug return
end
return p;																		-- export the module table (and with it get_ietf_parts) to whatever loads this module