-- Documentation for this module may be created at Module:Sandbox/trappist the monk/test/doc

--[[
this code is a test of an alternate method for dissecting IETF language tags.  Because this is a hack, it is only
meant to be run from the debug console.  To do that, in the console window below, type: =p.get_ietf_parts(<tag>)
]]

local p = {};																	-- module table; local so that loading this module does not create a global; exported by the return at end of file



--[[--------------------------< P A R S E _ I E T F >----------------------------------------------------------

tries the Lua patterns in <pattern_table>, in sequence order, against <source> and stops at the first pattern
that matches.

returns four values: the captures of the first matching pattern; when that pattern has fewer than four captures
the remaining values are nil.  returns nothing when none of the patterns match.

]]

local function parse_ietf (source, pattern_table)
	for _, candidate in ipairs (pattern_table) do
		local first, second, third, fourth = source:match (candidate);
		if first then															-- first capture is nil only when <candidate> did not match
			return first, second, third, fourth;
		end
	end
end


--[[--------------------------< G E T _ I E T F _ P A R T S >--------------------------------------------------

dissects an IETF language tag into its parts using Lua patterns:
	primary language subtag (required) - two or three letters
	script subtag - four letters
	region subtag - two letters or three digits
	variant subtag - four digits or five to eight alphanumeric characters
	private subtag - x- followed by one to eight alphanumeric characters; only supported with the primary language subtag

in any one of these forms
	lang					lang-variant
	lang-script				lang-script-variant
	lang-region				lang-region-variant
	lang-script-region		lang-script-region-variant
	lang-x-private

this test version checks only the shape of each subtag; nothing here looks subtags up in the IANA registry,
promotes three-character codes to two-character synonyms, or handles template parameters.

<frame> may be either:
	a frame (or any table that has an args table), in which case the tag is taken from frame.args[1], or
	the tag itself as a string, which is how the function is called from the debug console

returns (debug form):
	tag matched one of the forms: one string; code, script, region, variant, private joined with ', ' where
		omitted parts are empty strings
	tag has more than three hyphens: one string; 'unrecognized language tag: ' followed by the tag
	tag matched none of the forms: five nils followed by 'unrecognized language tag: ' and the tag
	tag is missing, empty, or not a string: five nils followed by 'missing language tag'

see http://www.rfc-editor.org/rfc/bcp/bcp47.txt section 2.1

]]

function p.get_ietf_parts (frame)
	local source;																					-- the language tag to dissect
	if 'table' == type (frame) then
		source = frame.args and frame.args[1];														-- frame-style call: tag is the first positional argument
	else
		source = frame;																				-- debug console call: the argument is the tag itself
	end

	if 'string' ~= type (source) or '' == source then												-- nothing to dissect; without this, source:gsub() below would throw
		return nil, nil, nil, nil, nil, 'missing language tag';
	end

	local csrv_patterns = {																			-- code, script, region, variant patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)%-(%d%d%d%d)$',												-- cc-Ssss-RR-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)%-(%d%d%d%d)$',											-- cc-Ssss-DDD-variant (where region is 3 digits; variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-Ssss-RR-variant (where variant is 5-8 alnum characters)
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',	-- cc-Ssss-DDD-variant (where region is 3 digits; variant is 5-8 alnum characters)
	}

	local crv_patterns = {																			-- code, region, variant patterns
		'^(%a%a%a?)%-(%a%a)%-(%d%d%d%d)$',															-- cc-RR-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%d%d%d)%-(%d%d%d%d)$',														-- cc-DDD-variant (where region is 3 digits; variant is 4 digits)
		'^(%a%a%a?)%-(%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',				-- cc-RR-variant (where variant is 5-8 alnum characters)
		'^(%a%a%a?)%-(%d%d%d)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',				-- cc-DDD-variant (where region is 3 digits; variant is 5-8 alnum characters)
	}

	local csv_patterns = {																			-- code, script, variant patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d%d)$',														-- cc-Ssss-variant (where variant is 4 digits)
		'^(%a%a%a?)%-(%a%a%a%a)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',			-- cc-Ssss-variant (where variant is 5-8 alnum characters)
	}

	local csr_patterns = {																			-- code, script, region patterns
		'^(%a%a%a?)%-(%a%a%a%a)%-(%a%a)$',															-- cc-Ssss-RR
		'^(%a%a%a?)%-(%a%a%a%a)%-(%d%d%d)$',														-- cc-Ssss-DDD (where region is 3 digits)
	}

	local cv_patterns = {																			-- code, variant patterns
		'^(%a%a%a?)%-(%d%d%d%d)$',																	-- cc-variant (where variant is 4 digits)
		'^(%a%a%a?)%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)$',						-- cc-variant (where variant is 5-8 alnum characters)
	}

	local cr_patterns = {																			-- code, region patterns
		'^(%a%a%a?)%-(%a%a)$',																		-- cc-RR
		'^(%a%a%a?)%-(%d%d%d)$',																	-- cc-DDD (region is 3 digits)
	}

	local cs_patterns = {																			-- code, script patterns
		'^(%a%a%a?)%-(%a%a%a%a)$',																	-- cc-Ssss
	}

	local cp_patterns = {																			-- code, private-use patterns
		'^(%a%a%a?)%-x%-([%a%d][%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?[%a%d]?)$'					-- cc-x-pppppppp where private is 1-8 alnum characters
	}

	local c_patterns = {																			-- code-only pattern
		'^(%a%a%a?)$',																				-- cc
	}

	local code, script, region, variant, private;
	local _, hyphen_count = source:gsub ('%-', '');												-- the number of hyphens selects which forms are candidates

	if 0 == hyphen_count then
		code = parse_ietf (source, c_patterns);
	elseif 1 == hyphen_count then																	-- lang-script, lang-region, lang-variant; tried in that order
		code, script = parse_ietf (source, cs_patterns);
		if not code then
			code, region = parse_ietf (source, cr_patterns);
		end
		if not code then
			code, variant = parse_ietf (source, cv_patterns);
		end
	elseif 2 == hyphen_count then																	-- lang-script-region, lang-script-variant, lang-region-variant, lang-x-private; tried in that order
		code, script, region = parse_ietf (source, csr_patterns);
		if not code then
			code, script, variant = parse_ietf (source, csv_patterns);
		end
		if not code then
			code, region, variant = parse_ietf (source, crv_patterns);
		end
		if not code then
			code, private = parse_ietf (source, cp_patterns);										-- the -x- counts as an element but we don't return that
		end
	elseif 3 == hyphen_count then
		code, script, region, variant = parse_ietf (source, csrv_patterns);
	else
		return table.concat ({'unrecognized language tag: ', source});		-- debug return
--		return nil, nil, nil, nil, nil, table.concat ({'unrecognized language tag: ', source});		-- don't know what we got but it is malformed - too many hyphens
	end

	if not code then
		return nil, nil, nil, nil, nil, table.concat ({'unrecognized language tag: ', source});		-- don't know what we got but it is malformed
	end
--	return code, script, region, variant, private;	-- debug return
	return table.concat ({code, script or '', region or '', variant or '', private or ''}, ', '); --debug return

end


return p;																		-- hand the module table (with get_ietf_parts) back to whatever loaded this module