Jump to content

Module:Unicode data/category

Kuva Wikipedia

Unicode General Category data derived from DerivedGeneralCategory.txt in the Unicode Character Database.

The data was generated by the two Lua 5.3 scripts below, which require LPeg. If both scripts are in the same folder as DerivedGeneralCategory.txt, open a command line in that folder and run lua print_data.lua; this writes the generated data to data.lua.

Lua 5.3 scripts
parse_data.lua
-- Parses DerivedGeneralCategory.txt (read from the current directory) and
-- returns a table of the form
--   { singles = { [codepoint] = category, ... },
--     ranges  = { { low, high, category }, ... } }
-- Entries whose category is 'Cn' are left out.

-- The call to io.open must be parenthesised: written as
-- `assert(io.open 'name', 'r')`, the 'r' becomes assert's message argument,
-- so a failed open would raise the useless error "r" instead of the reason
-- reported by io.open.
local f = assert(io.open('DerivedGeneralCategory.txt', 'r'))
local Derived_General_Category = f:read 'a'
f:close()

local lpeg = require 'lpeg'

-- Copy every lpeg field whose name starts with an uppercase letter
-- (P, R, S, V, C, Cmt, ...) into the environment so that the grammar below
-- can use them unqualified.
for k, v in pairs(lpeg) do
	if type(k) == 'string' then
		local first_letter = k:sub(1, 1)
		if first_letter == first_letter:upper() then
			_ENV[k] = v
		end
	end
end

local General_Category_data = { singles = {}, ranges = {} }

-- Match-time callback for one data line. Receives the subject string, the
-- current position and the captures of the line: either
-- (low, high, category) for a range or (codepoint, category) for a single
-- code point. Stores the entry and returns pos so that matching carries on
-- from the current position.
local function process_match(str, pos, ...)
	if select(3, ...) then -- three arguments: XXXX..XXXX ; gc
		local low, high, category = ...
		if category ~= 'Cn' then
			low, high = tonumber(low, 16), tonumber(high, 16)
			table.insert(General_Category_data.ranges, { low, high, category })
		end
	else -- two arguments: XXXX ; gc
		local codepoint, category = ...
		if category ~= 'Cn' then
			codepoint = tonumber(codepoint, 16)
			General_Category_data.singles[codepoint] = category
		end
	end
	return pos
end

local patt = P {
	-- Try to match a data line at each position; otherwise skip one character.
	(V 'line' + 1)^1,
	-- "XXXX ; gc" or "XXXX..XXXX ; gc", followed by the rest of the line.
	line = Cmt(
		(V 'range' + C(V 'codepoint'))
			* V 'white' * P ';' * V 'white'
			* C(V 'gc')
			* (1 - V 'nl')^0,
		process_match),
	range = C(V 'codepoint') * P '..' * C(V 'codepoint'),
	-- Four to six uppercase hexadecimal digits.
	codepoint = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-2,
	-- An uppercase letter followed by any one character, e.g. "Lu".
	gc = R 'AZ' * P(1),
	hex = R("09", "AF"),
	white = S ' \t'^0,
	nl = P '\r'^-1 * P '\n',
}

patt:match(Derived_General_Category)

return General_Category_data
print_data.lua
-- Runs the parser script and writes its result to data.lua as a Lua source
-- file that returns { singles = { ... }, ranges = { ... } }.

-- The parser script is named parse_data.lua; loading "make_data.lua" fails
-- when only the two documented scripts are present.
local data_filename = [[parse_data.lua]]
local data = dofile(data_filename)

local output_filename = [[data.lua]]
local output = assert(io.open(output_filename, 'w'))

-- Writes a string.format-style formatted string to the output file.
local function writef(...)
	output:write(string.format(...))
end

writef [[
return {
	singles = {
]]

-- Check that maximum "singles" codepoint is less than 0x100000?
-- Emit the single code points in ascending order using only the standard
-- library, so no helper module beyond the two scripts is needed. The order
-- of the entries does not affect the table that data.lua returns.
local codepoints = {}
for codepoint in pairs(data.singles) do
	codepoints[#codepoints + 1] = codepoint
end
table.sort(codepoints)

for _, codepoint in ipairs(codepoints) do
	writef('\t\t [0x%05X] = "%s",\n', codepoint, data.singles[codepoint])
end

writef [[
	},
	ranges = {
]]

-- Orders ranges by their first code point.
local function compare_ranges(range1, range2)
	return range1[1] < range2[1]
end

table.sort(data.ranges, compare_ranges)

for _, range in ipairs(data.ranges) do
	writef('\t\t{ 0x%06X, 0x%06X, "%s" },\n', table.unpack(range))
end

writef [[
	},
}]]

-- Close explicitly so the file is flushed and the handle released.
output:close()



-- [[:commons:Data:Unicode/data/category/singles.tab]]
-- [[:commons:Data:Unicode/data/category/ranges.tab]]
-- [[:commons:Data:Unicode/data/category/names.tab]]

--- Builds the General Category lookup tables from the three tabular data pages.
-- Returns a table with three fields:
--   singles    : [codepoint] = category, for rows of singles.tab
--   ranges     : sequence of { low, high, category }, for rows of ranges.tab
--   long_names : [first column] = second column, for rows of names.tab
-- Code points are read as hexadecimal strings. Rows with a missing column,
-- rows whose category is "Cn", and rows whose code point does not parse as
-- hexadecimal are skipped.
local function get_result()
	local result = { singles = {}, ranges = {}, long_names = {} }

	-- singles
	local data = mw.ext.data.get("Unicode/data/category/singles.tab")
	for _, cols in ipairs(data.data) do
		local category = cols[2]
		if cols[1] and category and category ~= "Cn" then
			local codepoint = tonumber(cols[1], 16)
			-- A nil key would raise "table index is nil", so skip rows
			-- whose code point is not valid hexadecimal.
			if codepoint then
				result.singles[codepoint] = category
			end
		end
	end

	-- ranges
	data = mw.ext.data.get("Unicode/data/category/ranges.tab")
	local ranges = result.ranges
	for _, cols in ipairs(data.data) do
		local category = cols[3]
		if cols[1] and cols[2] and category and category ~= "Cn" then
			local low, high = tonumber(cols[1], 16), tonumber(cols[2], 16)
			-- Skip rows with an unparseable bound rather than storing a
			-- range whose limits are nil.
			if low and high then
				ranges[#ranges + 1] = { low, high, category }
			end
		end
	end

	-- long_names
	data = mw.ext.data.get("Unicode/data/category/names.tab")
	for _, cols in ipairs(data.data) do
		if cols[1] and cols[2] then
			result.long_names[cols[1]] = cols[2]
		end
	end

	return result
end

return get_result()