Module:Unicode convert

Ready for use

This module is rated as ready for general use. It has reached a mature form and is thought to be relatively bug-free and ready for use wherever appropriate. It is ready to mention on help pages and other Wikipedia resources as an option for new users to learn. To reduce server load and bad output, it should be improved by sandbox testing rather than repeated trial-and-error editing.

Usage

Converts Unicode character codes, always given in hexadecimal, to their UTF-8 or UTF-16 representation in upper-case hex or decimal. The conversion can also be reversed for UTF-8. The UTF-16 form will accept and pass through unpaired surrogates, e.g. {{#invoke:Unicode convert|getUTF16|D835}} → D835. The reverse function fromUTF8 accepts multiple characters, and can have both input and output set to decimal.

When using from another module, you may call these functions as e.g. unicodeConvert.getUTF8{ args = {'1F345'} }, without a proper frame object.

To find the character code of a given symbol (in decimal), use e.g. {{#invoke:ustring|codepoint|🐱}} → 128049.

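Code	Output
{{#invoke:Unicode convert|getUTF8|1F345}}	F0 9F 8D 85
{{#invoke:Unicode convert|getUTF8|1F345|base=dec}}	240 159 141 133
{{#invoke:Unicode convert|fromUTF8|F0 9F 8D 85}}	1F345
{{#invoke:Unicode convert|fromUTF8|F0 9F 8D 85|base=dec}}	127813
{{#invoke:Unicode convert|getUTF16|1F345}}	D83C DF45
{{#invoke:Unicode convert|getUTF16|1F345|base=dec}}	55356 57157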

See also

Template:Unicode templates

local p = {}

-- NOTE: all these functions use frame solely for its args member.
-- Modules using them may therefore call them with a fake frame table
-- containing only args.

-- Returns the UTF-8 encoding of a code point as space-separated byte values.
-- frame.args[1]: the code point in hexadecimal; a missing or unparsable
--                value is treated as 0.
-- frame.args.base: '10' or 'dec' prints each byte in decimal; any other
--                  value (or none) prints two-digit upper-case hex.
function p.getUTF8(frame)
	local codepoint = tonumber(frame.args[1] or '0', 16) or 0
	local encoded = mw.ustring.char(codepoint)

	local base = frame.args['base']
	local pattern = '%02X'
	if base == '10' or base == 'dec' then
		pattern = '%d'
	end

	-- Format every byte of the encoded character in turn.
	local parts = {}
	for _, value in ipairs({mw.ustring.byte(encoded, 1, -1)}) do
		parts[#parts + 1] = pattern:format(value)
	end
	return table.concat(parts, ' ')
end

-- Returns the UTF-16 encoding of a code point as space-separated code units.
-- frame.args[1]: the code point in hexadecimal; a missing or unparsable
--                value is treated as 0.
-- frame.args.base: '10' or 'dec' prints each unit in decimal; any other
--                  value (or none) prints four-digit upper-case hex.
-- Returns one unit for code points up to FFFF, two units (a surrogate pair)
-- for 10000..10FFFF, and the empty string for anything above 10FFFF.
p.getUTF16 = function (frame)
	local codepoint = tonumber(frame.args[1] or '0', 16) or 0
	local format = ({ -- TODO reduce the number of options.
		['10'] = '%d',
		dec = '%d'
	})[frame.args['base']] or '%04X'
	if codepoint <= 0xFFFF then -- NB this also returns lone surrogate characters
		return format:format(codepoint)
	elseif codepoint > 0x10FFFF then -- There are no codepoints above this
		return ''
	end
	-- Supplementary planes: split the 20-bit offset into a surrogate pair.
	codepoint = codepoint - 0x10000
	-- Keep bit32 local: assigning it as a global leaks into _G and raises an
	-- error when a calling module has enabled strict mode.
	local bit32 = require('bit32')
	return (format .. ' ' .. format):format(
		bit32.rshift(codepoint, 10) + 0xD800,  -- high (lead) surrogate
		bit32.band(codepoint, 0x3FF) + 0xDC00) -- low (trail) surrogate
end

-- Decodes a whitespace-separated list of UTF-8 byte values into code points.
-- frame.args[1]: the byte values; a missing argument is treated as an empty
--                list and yields the empty string.
-- frame.args.basein: 'dec' reads the bytes as decimal, otherwise hexadecimal.
-- frame.args.base: 'dec' prints the code points in decimal, otherwise as
--                  upper-case hex padded to at least two digits.
-- Returns the code points separated by single spaces.
p.fromUTF8 = function(frame)
	local basein = frame.args['basein'] == 'dec' and 10 or 16
	local format = frame.args['base'] == 'dec' and '%d' or '%02X'
	local bytes = {}
	-- Default to '' so a missing argument does not raise inside gsplit.
	for token in mw.text.gsplit(frame.args[1] or '', '%s') do
		local value = tonumber(token, basein)
		-- Empty tokens (repeated whitespace) and unparsable tokens are skipped.
		if value then
			bytes[#bytes + 1] = value
		end
	end
	local chars = {mw.ustring.codepoint(string.char(unpack(bytes)), 1, -1)}
	for i = 1, #chars do
		chars[i] = format:format(chars[i])
	end
	return table.concat(chars, ' ')
end

return p