Module:Text: Difference between revisions
From the Croc Wiki, the Croc encyclopedia
Jump to navigation | Jump to search
Content added Content deleted
de>PerfektesChaos (Setup) |
m (1 revision imported from wikipedia:Module:Text) |
||
(45 intermediate revisions by 12 users not shown) | |||
Line 1: | Line 1: | ||
local yesNo = require("Module:Yesno") |
|||
--[=[ 2013-11-05 |
|||
local Text = { serial = "2022-07-21", |
|||
suite = "Text" } |
|||
--[=[ |
|||
Text utilities |
Text utilities |
||
]=] |
]=] |
||
Line 5: | Line 8: | ||
local |
-- local globals |
||
local |
local PatternCJK = false |
||
local |
local PatternCombined = false |
||
local |
local PatternLatin = false |
||
local PatternTerminated = false |
|||
local QuoteLang = false |
|||
local QuoteType = false |
|||
local RangesLatin = false |
|||
local SeekQuote = false |
|||
local function initLatinData() |
|||
if not RangesLatin then |
|||
RangesLatin = { { 7, 687 }, |
|||
{ 7531, 7578 }, |
|||
{ 7680, 7935 }, |
|||
{ 8194, 8250 } } |
|||
end |
|||
if not PatternLatin then |
|||
local range |
|||
PatternLatin = "^[" |
|||
for i = 1, #RangesLatin do |
|||
range = RangesLatin[ i ] |
|||
PatternLatin = PatternLatin .. |
|||
mw.ustring.char( range[ 1 ], 45, range[ 2 ] ) |
|||
end -- for i |
|||
PatternLatin = PatternLatin .. "]*$" |
|||
end |
|||
end |
|||
local function initQuoteData() |
|||
-- Create quote definitions |
|||
if not QuoteLang then |
|||
QuoteLang = |
|||
{ af = "bd", |
|||
ar = "la", |
|||
be = "labd", |
|||
bg = "bd", |
|||
ca = "la", |
|||
cs = "bd", |
|||
da = "bd", |
|||
de = "bd", |
|||
dsb = "bd", |
|||
et = "bd", |
|||
el = "lald", |
|||
en = "ld", |
|||
es = "la", |
|||
eu = "la", |
|||
-- fa = "la", |
|||
fi = "rd", |
|||
fr = "laSPC", |
|||
ga = "ld", |
|||
he = "ldla", |
|||
hr = "bd", |
|||
hsb = "bd", |
|||
hu = "bd", |
|||
hy = "labd", |
|||
id = "rd", |
|||
is = "bd", |
|||
it = "ld", |
|||
ja = "x300C", |
|||
ka = "bd", |
|||
ko = "ld", |
|||
lt = "bd", |
|||
lv = "bd", |
|||
nl = "ld", |
|||
nn = "la", |
|||
no = "la", |
|||
pl = "bdla", |
|||
pt = "lald", |
|||
ro = "bdla", |
|||
ru = "labd", |
|||
sk = "bd", |
|||
sl = "bd", |
|||
sq = "la", |
|||
sr = "bx", |
|||
sv = "rd", |
|||
th = "ld", |
|||
tr = "ld", |
|||
uk = "la", |
|||
zh = "ld", |
|||
["de-ch"] = "la", |
|||
["en-gb"] = "lsld", |
|||
["en-us"] = "ld", |
|||
["fr-ch"] = "la", |
|||
["it-ch"] = "la", |
|||
["pt-br"] = "ldla", |
|||
["zh-tw"] = "x300C", |
|||
["zh-cn"] = "ld" } |
|||
end |
|||
if not QuoteType then |
|||
QuoteType = |
|||
{ bd = { { 8222, 8220 }, { 8218, 8217 } }, |
|||
bdla = { { 8222, 8220 }, { 171, 187 } }, |
|||
bx = { { 8222, 8221 }, { 8218, 8217 } }, |
|||
la = { { 171, 187 }, { 8249, 8250 } }, |
|||
laSPC = { { 171, 187 }, { 8249, 8250 }, true }, |
|||
labd = { { 171, 187 }, { 8222, 8220 } }, |
|||
lald = { { 171, 187 }, { 8220, 8221 } }, |
|||
ld = { { 8220, 8221 }, { 8216, 8217 } }, |
|||
ldla = { { 8220, 8221 }, { 171, 187 } }, |
|||
lsld = { { 8216, 8217 }, { 8220, 8221 } }, |
|||
rd = { { 8221, 8221 }, { 8217, 8217 } }, |
|||
x300C = { { 0x300C, 0x300D }, |
|||
{ 0x300E, 0x300F } } } |
|||
end |
|||
end -- initQuoteData() |
|||
Text.containsCJK = function ( analyse ) |
|||
local function fiatQuote( apply, alien, advance ) |
|||
-- Quote text |
|||
-- Parameter: |
|||
-- apply -- string, with text |
|||
-- alien -- string, with language code |
|||
-- advance -- number, with level 1 or 2 |
|||
local r = apply and tostring(apply) or "" |
|||
alien = alien or "en" |
|||
advance = tonumber(advance) or 0 |
|||
local suite |
|||
initQuoteData() |
|||
local slang = alien:match( "^(%l+)-" ) |
|||
suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"] |
|||
if suite then |
|||
local quotes = QuoteType[ suite ] |
|||
if quotes then |
|||
local space |
|||
if quotes[ 3 ] then |
|||
space = " " |
|||
else |
|||
space = "" |
|||
end |
|||
quotes = quotes[ advance ] |
|||
if quotes then |
|||
r = mw.ustring.format( "%s%s%s%s%s", |
|||
mw.ustring.char( quotes[ 1 ] ), |
|||
space, |
|||
apply, |
|||
space, |
|||
mw.ustring.char( quotes[ 2 ] ) ) |
|||
end |
|||
else |
|||
mw.log( "fiatQuote() " .. suite ) |
|||
end |
|||
end |
|||
return r |
|||
end -- fiatQuote() |
|||
Text.char = function ( apply, again, accept ) |
|||
-- Create string from codepoints |
|||
-- Parameter: |
|||
-- apply -- table (sequence) with numerical codepoints, or nil |
|||
-- again -- number of repetitions, or nil |
|||
-- accept -- true, if no error messages to be appended |
|||
-- Returns: string |
|||
local r = "" |
|||
apply = type(apply) == "table" and apply or {} |
|||
again = math.floor(tonumber(again) or 1) |
|||
if again < 1 then |
|||
return "" |
|||
end |
|||
local bad = { } |
|||
local codes = { } |
|||
for _, v in ipairs( apply ) do |
|||
local n = tonumber(v) |
|||
if not n or (n < 32 and n ~= 9 and n ~= 10) then |
|||
table.insert(bad, tostring(v)) |
|||
else |
|||
table.insert(codes, math.floor(n)) |
|||
end |
|||
end |
|||
if #bad > 0 then |
|||
if not accept then |
|||
r = tostring( mw.html.create( "span" ) |
|||
:addClass( "error" ) |
|||
:wikitext( "bad codepoints: " .. table.concat( bad, " " )) ) |
|||
end |
|||
return r |
|||
end |
|||
if #codes > 0 then |
|||
r = mw.ustring.char( unpack( codes ) ) |
|||
if again > 1 then |
|||
r = r:rep(again) |
|||
end |
|||
end |
|||
return r |
|||
end -- Text.char() |
|||
local function trimAndFormat(args, fmt) |
|||
local result = {} |
|||
if type(args) ~= 'table' then |
|||
args = {args} |
|||
end |
|||
for _, v in ipairs(args) do |
|||
v = mw.text.trim(tostring(v)) |
|||
if v ~= "" then |
|||
table.insert(result,fmt and mw.ustring.format(fmt, v) or v) |
|||
end |
|||
end |
|||
return result |
|||
end |
|||
Text.concatParams = function ( args, apply, adapt ) |
|||
-- Concat list items into one string |
|||
-- Parameter: |
|||
-- args -- table (sequence) with numKey=string |
|||
-- apply -- string (optional); separator (default: "|") |
|||
-- adapt -- string (optional); format including "%s" |
|||
-- Returns: string |
|||
local collect = { } |
|||
return table.concat(trimAndFormat(args,adapt), apply or "|") |
|||
end -- Text.concatParams() |
|||
Text.containsCJK = function ( s ) |
|||
-- Is any CJK code within? |
-- Is any CJK code within? |
||
-- Parameter: |
-- Parameter: |
||
-- |
-- s -- string |
||
-- Returns: true, if CJK detected |
-- Returns: true, if CJK detected |
||
s = s and tostring(s) or "" |
|||
if not patternCJK then |
if not patternCJK then |
||
patternCJK = mw.ustring.char( 91, |
patternCJK = mw.ustring.char( 91, |
||
4352, 45, 4607, |
|||
11904, 45, 42191, |
|||
43072, 45, 43135, |
|||
44032, 45, 55215, |
|||
63744, 45, 64255, |
|||
65072, 45, 65103, |
|||
65381, 45, 65500, |
|||
131072, 45, 196607, |
|||
93 ) |
93 ) |
||
end |
end |
||
return mw.ustring.find( s, patternCJK ) ~= nil |
|||
end -- Text.containsCJK() |
|||
r = true |
|||
Text.removeDelimited = function (s, prefix, suffix) |
|||
-- Remove all text in s delimited by prefix and suffix (inclusive) |
|||
-- Arguments: |
|||
-- s = string to process |
|||
-- prefix = initial delimiter |
|||
-- suffix = ending delimiter |
|||
-- Returns: stripped string |
|||
s = s and tostring(s) or "" |
|||
prefix = prefix and tostring(prefix) or "" |
|||
suffix = suffix and tostring(suffix) or "" |
|||
local prefixLen = mw.ustring.len(prefix) |
|||
local suffixLen = mw.ustring.len(suffix) |
|||
if prefixLen == 0 or suffixLen == 0 then |
|||
return s |
|||
end |
|||
local i = s:find(prefix, 1, true) |
|||
local r = s |
|||
local j |
|||
while i do |
|||
j = r:find(suffix, i + prefixLen) |
|||
if j then |
|||
r = r:sub(1, i - 1)..r:sub(j+suffixLen) |
|||
else |
|||
r = r:sub(1, i - 1) |
|||
end |
|||
i = r:find(prefix, 1, true) |
|||
end |
|||
return r |
|||
end |
|||
Text.getPlain = function ( adjust ) |
|||
-- Remove wikisyntax from string, except templates |
|||
-- Parameter: |
|||
-- adjust -- string |
|||
-- Returns: string |
|||
local r = Text.removeDelimited(adjust,"<!--","-->") |
|||
r = r:gsub( "(</?%l[^>]*>)", "" ) |
|||
:gsub( "'''", "" ) |
|||
:gsub( "''", "" ) |
|||
:gsub( " ", " " ) |
|||
return r |
|||
end -- Text.getPlain() |
|||
Text.isLatinRange = function (s) |
|||
-- Are characters expected to be latin or symbols within latin texts? |
|||
-- Arguments: |
|||
-- s = string to analyze |
|||
-- Returns: true, if valid for latin only |
|||
s = s and tostring(s) or "" --- ensure input is always string |
|||
initLatinData() |
|||
return mw.ustring.match(s, PatternLatin) ~= nil |
|||
end -- Text.isLatinRange() |
|||
Text.isQuote = function ( s ) |
|||
-- Is this character any quotation mark? |
|||
-- Parameter: |
|||
-- s = single character to analyze |
|||
-- Returns: true, if s is quotation mark |
|||
s = s and tostring(s) or "" |
|||
if s == "" then |
|||
return false |
|||
end |
|||
if not SeekQuote then |
|||
SeekQuote = mw.ustring.char( 34, -- " |
|||
39, -- ' |
|||
171, -- laquo |
|||
187, -- raquo |
|||
8216, -- lsquo |
|||
8217, -- rsquo |
|||
8218, -- sbquo |
|||
8220, -- ldquo |
|||
8221, -- rdquo |
|||
8222, -- bdquo |
|||
8249, -- lsaquo |
|||
8250, -- rsaquo |
|||
0x300C, -- CJK |
|||
0x300D, -- CJK |
|||
0x300E, -- CJK |
|||
0x300F ) -- CJK |
|||
end |
|||
return mw.ustring.find( SeekQuote, s, 1, true ) ~= nil |
|||
end -- Text.isQuote() |
|||
Text.listToText = function ( args, adapt ) |
|||
-- Format list items similar to mw.text.listToText() |
|||
-- Parameter: |
|||
-- args -- table (sequence) with numKey=string |
|||
-- adapt -- string (optional); format including "%s" |
|||
-- Returns: string |
|||
return mw.text.listToText(trimAndFormat(args, adapt)) |
|||
end -- Text.listToText() |
|||
Text.quote = function ( apply, alien, advance ) |
|||
-- Quote text |
|||
-- Parameter: |
|||
-- apply -- string, with text |
|||
-- alien -- string, with language code, or nil |
|||
-- advance -- number, with level 1 or 2, or nil |
|||
-- Returns: quoted string |
|||
apply = apply and tostring(apply) or "" |
|||
local mode, slang |
|||
if type( alien ) == "string" then |
|||
slang = mw.text.trim( alien ):lower() |
|||
else |
else |
||
slang = mw.title.getCurrentTitle().pageLanguage |
|||
r = false |
|||
if not slang then |
|||
-- TODO FIXME: Introduction expected 2017-04 |
|||
slang = mw.language.getContentLanguage():getCode() |
|||
end |
|||
end |
|||
if advance == 2 then |
|||
mode = 2 |
|||
else |
|||
mode = 1 |
|||
end |
|||
return fiatQuote( mw.text.trim( apply ), slang, mode ) |
|||
end -- Text.quote() |
|||
Text.quoteUnquoted = function ( apply, alien, advance ) |
|||
-- Quote text, if not yet quoted and not empty |
|||
-- Parameter: |
|||
-- apply -- string, with text |
|||
-- alien -- string, with language code, or nil |
|||
-- advance -- number, with level 1 or 2, or nil |
|||
-- Returns: string; possibly quoted |
|||
local r = mw.text.trim( apply and tostring(apply) or "" ) |
|||
local s = mw.ustring.sub( r, 1, 1 ) |
|||
if s ~= "" and not Text.isQuote( s, advance ) then |
|||
s = mw.ustring.sub( r, -1, 1 ) |
|||
if not Text.isQuote( s ) then |
|||
r = Text.quote( r, alien, advance ) |
|||
end |
|||
end |
end |
||
return r |
return r |
||
end -- Text. |
end -- Text.quoteUnquoted() |
||
Text.removeDiacritics = function ( adjust ) |
|||
-- Remove all diacritics |
|||
-- Parameter: |
|||
-- adjust -- string |
|||
-- Returns: string; all latin letters should be ASCII |
|||
-- or basic greek or cyrillic or symbols etc. |
|||
local cleanup, decomposed |
|||
if not PatternCombined then |
|||
PatternCombined = mw.ustring.char( 91, |
|||
0x0300, 45, 0x036F, |
|||
0x1AB0, 45, 0x1AFF, |
|||
0x1DC0, 45, 0x1DFF, |
|||
0xFE20, 45, 0xFE2F, |
|||
93 ) |
|||
end |
|||
decomposed = mw.ustring.toNFD( adjust and tostring(adjust) or "" ) |
|||
cleanup = mw.ustring.gsub( decomposed, PatternCombined, "" ) |
|||
return mw.ustring.toNFC( cleanup ) |
|||
end -- Text.removeDiacritics() |
|||
Line 40: | Line 420: | ||
-- Returns: true, if sentence terminated |
-- Returns: true, if sentence terminated |
||
local r |
local r |
||
if not |
if not PatternTerminated then |
||
PatternTerminated = mw.ustring.char( 91, |
|||
12290, |
12290, |
||
65281, |
65281, |
||
Line 48: | Line 428: | ||
.. "!%.%?…][\"'%]‹›«»‘’“”]*$" |
.. "!%.%?…][\"'%]‹›«»‘’“”]*$" |
||
end |
end |
||
if mw.ustring.find( analyse, |
if mw.ustring.find( analyse, PatternTerminated ) then |
||
r = true |
r = true |
||
else |
else |
||
Line 56: | Line 436: | ||
end -- Text.sentenceTerminated() |
end -- Text.sentenceTerminated() |
||
Text.ucfirstAll = function ( adjust) |
|||
-- Capitalize all words |
|||
-- Arguments: |
|||
-- adjust = string to adjust |
|||
-- Returns: string with all first letters in upper case |
|||
adjust = adjust and tostring(adjust) or "" |
|||
local r = mw.text.decode(adjust,true) |
|||
local i = 1 |
|||
local c, j, m |
|||
m = (r ~= adjust) |
|||
r = " "..r |
|||
while i do |
|||
i = mw.ustring.find( r, "%W%l", i ) |
|||
if i then |
|||
j = i + 1 |
|||
c = mw.ustring.upper( mw.ustring.sub( r, j, j ) ) |
|||
r = string.format( "%s%s%s", |
|||
mw.ustring.sub( r, 1, i ), |
|||
c, |
|||
mw.ustring.sub( r, i + 2 ) ) |
|||
i = j |
|||
end |
|||
end -- while i |
|||
r = r:sub( 2 ) |
|||
if m then |
|||
r = mw.text.encode(r) |
|||
end |
|||
return r |
|||
end -- Text.ucfirstAll() |
|||
Line 65: | Line 476: | ||
-- Returns: string with non-latin parts enclosed in <span> |
-- Returns: string with non-latin parts enclosed in <span> |
||
local r |
local r |
||
initLatinData() |
|||
if not patternLatin then |
|||
if mw.ustring.match( adjust, PatternLatin ) then |
|||
7, 45, 591, |
|||
8194, 45, 8250, |
|||
93, 42, 36 ) |
|||
end |
|||
if mw.ustring.match( adjust, patternLatin ) then |
|||
-- latin only, horizontal dashes, quotes |
-- latin only, horizontal dashes, quotes |
||
r = adjust |
r = adjust |
||
Line 80: | Line 486: | ||
local m = false |
local m = false |
||
local n = mw.ustring.len( adjust ) |
local n = mw.ustring.len( adjust ) |
||
local span = "%s%s<span style='font-style:normal'>%s</span>" |
local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>" |
||
local flat = function ( a ) |
local flat = function ( a ) |
||
-- isLatin |
-- isLatin |
||
local range |
|||
for i = 1, #RangesLatin do |
|||
range = RangesLatin[ i ] |
|||
if a >= range[ 1 ] and a <= range[ 2 ] then |
|||
return true |
|||
end |
|||
end -- for i |
|||
end -- flat() |
|||
local focus = function ( a ) |
|||
-- char is not ambivalent |
|||
local r = ( a > 64 ) |
|||
if r then |
|||
r = ( a < 8192 or a > 8212 ) |
|||
else |
|||
r = ( a == 38 or a == 60 ) -- '&' '<' |
|||
end |
|||
return r |
|||
end -- focus() |
|||
local form = function ( a ) |
local form = function ( a ) |
||
return string.format( span, |
return string.format( span, |
||
Line 90: | Line 512: | ||
mw.ustring.sub( adjust, k, j - 1 ), |
mw.ustring.sub( adjust, k, j - 1 ), |
||
mw.ustring.sub( adjust, j, a ) ) |
mw.ustring.sub( adjust, j, a ) ) |
||
end -- form() |
end -- form() |
||
r = "" |
r = "" |
||
for i = 1, n do |
for i = 1, n do |
||
c = mw.ustring.codepoint( adjust, i, i ) |
c = mw.ustring.codepoint( adjust, i, i ) |
||
if |
if focus( c ) then |
||
if flat( c ) then |
if flat( c ) then |
||
if j then |
if j then |
||
Line 105: | Line 527: | ||
end |
end |
||
if j then |
if j then |
||
local nx = i - 1 |
|||
local s = "" |
|||
for ix = nx, 1, -1 do |
|||
c = mw.ustring.sub( adjust, ix, ix ) |
|||
if c == " " or c == "(" then |
|||
nx = nx - 1 |
|||
s = c .. s |
|||
else |
|||
break -- for ix |
|||
end |
|||
end -- for ix |
|||
r = form( nx ) .. s |
|||
j = false |
j = false |
||
k = i |
k = i |
||
Line 122: | Line 555: | ||
m = m + 1 |
m = m + 1 |
||
end |
end |
||
end -- for i |
end -- for i |
||
if j then |
if j and ( not m or m < n ) then |
||
r = form( n ) |
r = form( n ) |
||
else |
else |
||
Line 131: | Line 564: | ||
return r |
return r |
||
end -- Text.uprightNonlatin() |
end -- Text.uprightNonlatin() |
||
Text.test = function ( about ) |
|||
local r |
|||
if about == "quote" then |
|||
initQuoteData() |
|||
r = { } |
|||
r.QuoteLang = QuoteLang |
|||
r.QuoteType = QuoteType |
|||
end |
|||
return r |
|||
end -- Text.test() |
|||
Line 137: | Line 582: | ||
local p = { } |
local p = { } |
||
for _, func in ipairs({'containsCJK','isLatinRange','isQuote','sentenceTerminated'}) do |
|||
function p.containsCJK( frame ) |
|||
p[func] = function (frame) |
|||
return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or "" |
|||
return Text[func]( frame.args[ 1 ] or "" ) and "1" or "" |
|||
end |
|||
end |
end |
||
for _, func in ipairs({'getPlain','removeDiacritics','ucfirstAll','uprightNonlatin'}) do |
|||
function p.sentenceTerminated( frame ) |
|||
p[func] = function (frame) |
|||
return Text.sentenceTerminated( frame.args[ 1 ] or "" ) and "1" or "" |
|||
return Text[func]( frame.args[ 1 ] or "" ) |
|||
end |
|||
end |
end |
||
function p. |
function p.char( frame ) |
||
local params = frame:getParent().args |
|||
local story = params[ 1 ] |
|||
local codes, lenient, multiple |
|||
if not story then |
|||
params = frame.args |
|||
story = params[ 1 ] |
|||
end |
|||
if story then |
|||
local items = mw.text.split( mw.text.trim(story), "%s+" ) |
|||
if #items > 0 then |
|||
local j |
|||
lenient = (yesNo(params.errors) == false) |
|||
codes = { } |
|||
multiple = tonumber( params[ "*" ] ) |
|||
for _, v in ipairs( items ) do |
|||
j = tonumber((v:sub( 1, 1 ) == "x" and "0" or "") .. v) |
|||
table.insert( codes, j or v ) |
|||
end |
|||
end |
|||
end |
|||
return Text.char( codes, multiple, lenient ) |
|||
end |
end |
||
function p.concatParams( frame ) |
|||
local args |
|||
local template = frame.args.template |
|||
if type( template ) == "string" then |
|||
template = mw.text.trim( template ) |
|||
template = ( template == "1" ) |
|||
end |
|||
if template then |
|||
args = frame:getParent().args |
|||
else |
|||
args = frame.args |
|||
end |
|||
return Text.concatParams( args, |
|||
frame.args.separator, |
|||
frame.args.format ) |
|||
end |
|||
function p.listToFormat(frame) |
|||
local lists = {} |
|||
local pformat = frame.args["format"] |
|||
local sep = frame.args["sep"] or ";" |
|||
-- Parameter parsen: Listen |
|||
for k, v in pairs(frame.args) do |
|||
local knum = tonumber(k) |
|||
if knum then lists[knum] = v end |
|||
end |
|||
-- Listen splitten |
|||
local maxListLen = 0 |
|||
for i = 1, #lists do |
|||
lists[i] = mw.text.split(lists[i], sep) |
|||
if #lists[i] > maxListLen then maxListLen = #lists[i] end |
|||
end |
|||
-- Ergebnisstring generieren |
|||
local result = "" |
|||
local result_line = "" |
|||
for i = 1, maxListLen do |
|||
result_line = pformat |
|||
for j = 1, #lists do |
|||
result_line = mw.ustring.gsub(result_line, "%%s", lists[j][i], 1) |
|||
end |
|||
result = result .. result_line |
|||
end |
|||
return result |
|||
end |
|||
function p.listToText( frame ) |
|||
local args |
|||
local template = frame.args.template |
|||
if type( template ) == "string" then |
|||
template = mw.text.trim( template ) |
|||
template = ( template == "1" ) |
|||
end |
|||
if template then |
|||
args = frame:getParent().args |
|||
else |
|||
args = frame.args |
|||
end |
|||
return Text.listToText( args, frame.args.format ) |
|||
end |
|||
function p.quote( frame ) |
|||
local slang = frame.args[2] |
|||
if type( slang ) == "string" then |
|||
slang = mw.text.trim( slang ) |
|||
if slang == "" then |
|||
slang = false |
|||
end |
|||
end |
|||
return Text.quote( frame.args[ 1 ] or "", |
|||
slang, |
|||
tonumber( frame.args[3] ) ) |
|||
end |
|||
function p.quoteUnquoted( frame ) |
|||
local slang = frame.args[2] |
|||
if type( slang ) == "string" then |
|||
slang = mw.text.trim( slang ) |
|||
if slang == "" then |
|||
slang = false |
|||
end |
|||
end |
|||
return Text.quoteUnquoted( frame.args[ 1 ] or "", |
|||
slang, |
|||
tonumber( frame.args[3] ) ) |
|||
end |
|||
function p.zip(frame) |
|||
local lists = {} |
|||
local seps = {} |
|||
local defaultsep = frame.args["sep"] or "" |
|||
local innersep = frame.args["isep"] or "" |
|||
local outersep = frame.args["osep"] or "" |
|||
-- Parameter parsen |
|||
for k, v in pairs(frame.args) do |
|||
local knum = tonumber(k) |
|||
if knum then lists[knum] = v else |
|||
if string.sub(k, 1, 3) == "sep" then |
|||
local sepnum = tonumber(string.sub(k, 4)) |
|||
if sepnum then seps[sepnum] = v end |
|||
end |
|||
end |
|||
end |
|||
-- sofern keine expliziten Separatoren angegeben sind, den Standardseparator verwenden |
|||
for i = 1, math.max(#seps, #lists) do |
|||
if not seps[i] then seps[i] = defaultsep end |
|||
end |
|||
-- Listen splitten |
|||
local maxListLen = 0 |
|||
for i = 1, #lists do |
|||
lists[i] = mw.text.split(lists[i], seps[i]) |
|||
if #lists[i] > maxListLen then maxListLen = #lists[i] end |
|||
end |
|||
local result = "" |
|||
for i = 1, maxListLen do |
|||
if i ~= 1 then result = result .. outersep end |
|||
for j = 1, #lists do |
|||
if j ~= 1 then result = result .. innersep end |
|||
result = result .. (lists[j][i] or "") |
|||
end |
|||
end |
|||
return result |
|||
end |
|||
function p.failsafe() |
|||
return Text.serial |
|||
end |
|||
p.Text = function () |
p.Text = function () |
Latest revision as of 17:57, August 8, 2023
Documentation for this module may be created at Module:Text/doc
local yesNo = require("Module:Yesno")
--[=[
Text utilities
]=]
local Text = {
    suite  = "Text",
    serial = "2022-07-21"
}

-- Module-level state shared by the functions of this file.
-- Every entry starts as false and is replaced by its real value
-- by whichever function needs it first.
local PatternCJK,  PatternCombined   = false, false
local PatternLatin, PatternTerminated = false, false
local QuoteLang,   QuoteType         = false, false
local RangesLatin, SeekQuote         = false, false
local function initLatinData()
    -- Make sure RangesLatin and PatternLatin are available
    --     RangesLatin   -- sequence of { first, last } codepoint pairs
    --     PatternLatin  -- anchored pattern "^[...]*$" built from the
    --                      pairs; matches strings made only of
    --                      codepoints within those ranges
    RangesLatin = RangesLatin or { { 7, 687 },
                                   { 7531, 7578 },
                                   { 7680, 7935 },
                                   { 8194, 8250 } }
    if PatternLatin then
        return
    end
    local pieces = { "^[" }
    for _, span in ipairs( RangesLatin ) do
        -- 45 is "-": each piece reads <first>-<last> inside the set
        pieces[ #pieces + 1 ] = mw.ustring.char( span[ 1 ], 45, span[ 2 ] )
    end
    pieces[ #pieces + 1 ] = "]*$"
    PatternLatin = table.concat( pieces )
end
local function initQuoteData()
    -- Make sure both quotation tables are available
    --     QuoteLang  -- language code -> name of a quotation scheme
    --     QuoteType  -- scheme name -> { level 1 pair, level 2 pair
    --                   [, true if marks are set off by a space] };
    --                   each pair is { opening, closing } codepoint
    if not QuoteLang then
        QuoteLang = {
            af  = "bd",
            ar  = "la",
            be  = "labd",
            bg  = "bd",
            ca  = "la",
            cs  = "bd",
            da  = "bd",
            de  = "bd",
            dsb = "bd",
            et  = "bd",
            el  = "lald",
            en  = "ld",
            es  = "la",
            eu  = "la",
            -- fa  = "la",
            fi  = "rd",
            fr  = "laSPC",
            ga  = "ld",
            he  = "ldla",
            hr  = "bd",
            hsb = "bd",
            hu  = "bd",
            hy  = "labd",
            id  = "rd",
            is  = "bd",
            it  = "ld",
            ja  = "x300C",
            ka  = "bd",
            ko  = "ld",
            lt  = "bd",
            lv  = "bd",
            nl  = "ld",
            nn  = "la",
            no  = "la",
            pl  = "bdla",
            pt  = "lald",
            ro  = "bdla",
            ru  = "labd",
            sk  = "bd",
            sl  = "bd",
            sq  = "la",
            sr  = "bx",
            sv  = "rd",
            th  = "ld",
            tr  = "ld",
            uk  = "la",
            zh  = "ld",
            [ "de-ch" ] = "la",
            [ "en-gb" ] = "lsld",
            [ "en-us" ] = "ld",
            [ "fr-ch" ] = "la",
            [ "it-ch" ] = "la",
            [ "pt-br" ] = "ldla",
            [ "zh-tw" ] = "x300C",
            [ "zh-cn" ] = "ld"
        }
    end
    if not QuoteType then
        -- Codepoints, named after the corresponding HTML entities
        local laquo,  raquo        = 171,  187
        local lsquo,  rsquo, sbquo = 8216, 8217, 8218
        local ldquo,  rdquo, bdquo = 8220, 8221, 8222
        local lsaquo, rsaquo       = 8249, 8250
        QuoteType = {
            bd    = { { bdquo, ldquo }, { sbquo,  rsquo  } },
            bdla  = { { bdquo, ldquo }, { laquo,  raquo  } },
            bx    = { { bdquo, rdquo }, { sbquo,  rsquo  } },
            la    = { { laquo, raquo }, { lsaquo, rsaquo } },
            laSPC = { { laquo, raquo }, { lsaquo, rsaquo }, true },
            labd  = { { laquo, raquo }, { bdquo,  ldquo  } },
            lald  = { { laquo, raquo }, { ldquo,  rdquo  } },
            ld    = { { ldquo, rdquo }, { lsquo,  rsquo  } },
            ldla  = { { ldquo, rdquo }, { laquo,  raquo  } },
            lsld  = { { lsquo, rsquo }, { ldquo,  rdquo  } },
            rd    = { { rdquo, rdquo }, { rsquo,  rsquo  } },
            x300C = { { 0x300C, 0x300D },
                      { 0x300E, 0x300F } }
        }
    end
end -- initQuoteData()
local function fiatQuote( apply, alien, advance )
    -- Wrap text in the quotation marks registered for a language
    -- Parameter:
    --     apply    -- text; converted by tostring(), nil becomes ""
    --     alien    -- string, with language code; nil/false means "en"
    --     advance  -- number, with level 1 or 2; for any other value
    --                 no marks are defined and the text stays unquoted
    -- Returns: string
    local r = apply and tostring( apply ) or ""
    alien = alien or "en"
    advance = tonumber( advance ) or 0
    initQuoteData()
    -- Try the full code first, then its primary subtag ("pt" of
    -- "pt-br"), finally fall back to English.
    local slang = alien:match( "^(%l+)-" )
    local suite = QuoteLang[ alien ]
                  or slang and QuoteLang[ slang ]
                  or QuoteLang[ "en" ]
    if suite then
        local quotes = QuoteType[ suite ]
        if quotes then
            -- A third entry marks schemes that separate the marks
            -- from the text by a space.
            local space = quotes[ 3 ] and " " or ""
            quotes = quotes[ advance ]
            if quotes then
                -- Format the normalised string r, not the raw argument:
                -- the raw value may be nil or another type that "%s"
                -- does not accept.
                r = mw.ustring.format( "%s%s%s%s%s",
                                       mw.ustring.char( quotes[ 1 ] ),
                                       space,
                                       r,
                                       space,
                                       mw.ustring.char( quotes[ 2 ] ) )
            end
        else
            -- QuoteLang names a scheme that QuoteType does not define
            mw.log( "fiatQuote() " .. suite )
        end
    end
    return r
end -- fiatQuote()
Text.char = function ( apply, again, accept )
    -- Create string from codepoints
    -- Parameter:
    --     apply   -- table (sequence) with numerical codepoints, or nil
    --     again   -- number of repetitions, or nil (default: 1)
    --     accept  -- true, if no error messages to be appended
    -- Returns: string; "" if there is nothing to create or again < 1.
    --          If any item is rejected, no character is created at all;
    --          unless accept is set, an error <span> listing the
    --          rejected items is returned instead.
    local r = ""
    apply = type( apply ) == "table" and apply or { }
    again = math.floor( tonumber( again ) or 1 )
    if again < 1 then
        return ""
    end
    local bad = { }
    local codes = { }
    for _, v in ipairs( apply ) do
        local n = tonumber( v )
        -- Rejected: non-numbers, NaN (n ~= n), control codes other
        -- than TAB (9) and LF (10), and anything beyond the last
        -- Unicode codepoint, which mw.ustring.char() would answer
        -- with a script error rather than a readable message.
        if not n
           or n ~= n
           or n > 0x10FFFF
           or ( n < 32 and n ~= 9 and n ~= 10 ) then
            table.insert( bad, tostring( v ) )
        else
            table.insert( codes, math.floor( n ) )
        end
    end
    if #bad > 0 then
        if not accept then
            r = tostring( mw.html.create( "span" )
                                 :addClass( "error" )
                                 :wikitext( "bad codepoints: " .. table.concat( bad, " " )) )
        end
        return r
    end
    if #codes > 0 then
        r = mw.ustring.char( unpack( codes ) )
        if again > 1 then
            r = r:rep( again )
        end
    end
    return r
end -- Text.char()
local function trimAndFormat(args, fmt)
    -- Collect the non-empty items of a list
    -- Parameter:
    --     args  -- table (sequence); any other value is treated as
    --              a list with that single item
    --     fmt   -- string (optional); format applied to each kept item
    -- Returns: table (sequence) of trimmed, optionally formatted strings
    local list = args
    if type( list ) ~= "table" then
        list = { list }
    end
    local kept = { }
    for _, item in ipairs( list ) do
        local text = mw.text.trim( tostring( item ) )
        if text ~= "" then
            if fmt then
                text = mw.ustring.format( fmt, text )
            end
            kept[ #kept + 1 ] = text
        end
    end
    return kept
end
Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    -- Items are trimmed; items that are empty after trimming are dropped
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    return table.concat( trimAndFormat( args, adapt ), apply or "|" )
end -- Text.concatParams()
Text.containsCJK = function ( s )
    -- Is any CJK code within?
    -- Parameter:
    --     s  -- string; other values are converted, nil becomes ""
    -- Returns: true, if CJK detected
    s = s and tostring( s ) or ""
    -- The character class is kept in the file-level local PatternCJK;
    -- a differently spelled name would silently create a global.
    if not PatternCJK then
        -- 91 "[", 45 "-", 93 "]": a set of codepoint ranges
        PatternCJK = mw.ustring.char( 91,
                                      4352, 45, 4607,
                                      11904, 45, 42191,
                                      43072, 45, 43135,
                                      44032, 45, 55215,
                                      63744, 45, 64255,
                                      65072, 45, 65103,
                                      65381, 45, 65500,
                                      131072, 45, 196607,
                                      93 )
    end
    return mw.ustring.find( s, PatternCJK ) ~= nil
end -- Text.containsCJK()
Text.removeDelimited = function (s, prefix, suffix)
    -- Remove all text in s delimited by prefix and suffix (inclusive)
    -- A prefix without a following suffix removes everything up to
    -- the end of the string.
    -- Arguments:
    --    s = string to process
    --    prefix = initial delimiter (literal text, not a pattern)
    --    suffix = ending delimiter (literal text, not a pattern)
    -- Returns: stripped string; s unchanged if a delimiter is empty
    s = s and tostring(s) or ""
    prefix = prefix and tostring(prefix) or ""
    suffix = suffix and tostring(suffix) or ""
    -- Byte lengths: string.find() and string.sub() below count bytes,
    -- so the offsets must be in bytes too, not in codepoints.
    local prefixLen = #prefix
    local suffixLen = #suffix
    if prefixLen == 0 or suffixLen == 0 then
        return s
    end
    local r = s
    local i = r:find(prefix, 1, true)
    local j
    while i do
        -- Plain search: a delimiter such as "-->" contains pattern
        -- magic ("-") and would otherwise match a lone ">".
        j = r:find(suffix, i + prefixLen, true)
        if j then
            r = r:sub(1, i - 1) .. r:sub(j + suffixLen)
        else
            r = r:sub(1, i - 1)
        end
        -- Restart from the beginning: joining the remaining parts
        -- may have formed a new prefix.
        i = r:find(prefix, 1, true)
    end
    return r
end
Text.getPlain = function ( adjust )
    -- Remove wikisyntax from string, except templates
    -- Removed: HTML comments, tags starting with a lowercase letter,
    -- bold/italic apostrophes; non-breaking spaces become plain spaces
    -- Parameter:
    --     adjust  -- string
    -- Returns: string
    local r = Text.removeDelimited( adjust, "<!--", "-->" )
    -- Bold markup is removed before italic markup, otherwise '''
    -- would leave a single apostrophe behind.
    -- NOTE(review): the last step used to replace a blank by a blank,
    -- which does nothing; presumably the "&nbsp;" entity got decoded
    -- somewhere on the way. Both the entity and the raw character
    -- U+00A0 (UTF-8 bytes 194 160) are handled now.
    r = r:gsub( "(</?%l[^>]*>)", "" )
         :gsub( "'''", "" )
         :gsub( "''", "" )
         :gsub( "&nbsp;", " " )
         :gsub( "\194\160", " " )
    return r
end -- Text.getPlain()
Text.isLatinRange = function (s)
    -- Does the text consist of codepoints from the latin ranges only?
    -- Arguments:
    --    s = string to analyze; other values are converted, nil is ""
    -- Returns: true, if valid for latin only
    local subject = s and tostring( s ) or ""
    initLatinData()
    if mw.ustring.match( subject, PatternLatin ) then
        return true
    end
    return false
end -- Text.isLatinRange()
Text.isQuote = function ( s )
    -- Is this character any quotation mark?
    -- Parameter:
    --     s = single character to analyze
    -- Returns: true, if s is quotation mark
    local probe = s and tostring( s ) or ""
    if probe == "" then
        return false
    end
    if not SeekQuote then
        -- One string holding every known quotation mark
        local marks = { 34,      -- "
                        39,      -- '
                        171,     -- laquo
                        187,     -- raquo
                        8216,    -- lsquo
                        8217,    -- rsquo
                        8218,    -- sbquo
                        8220,    -- ldquo
                        8221,    -- rdquo
                        8222,    -- bdquo
                        8249,    -- lsaquo
                        8250,    -- rsaquo
                        0x300C,  -- CJK
                        0x300D,  -- CJK
                        0x300E,  -- CJK
                        0x300F } -- CJK
        SeekQuote = mw.ustring.char( unpack( marks ) )
    end
    -- Plain substring search of the probe within the list of marks
    local found = mw.ustring.find( SeekQuote, probe, 1, true )
    return found ~= nil
end -- Text.isQuote()
Text.listToText = function ( args, adapt )
    -- Join list items the way mw.text.listToText() does,
    -- after trimming them and dropping the empty ones
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    local items = trimAndFormat( args, adapt )
    return mw.text.listToText( items )
end -- Text.listToText()
Text.quote = function ( apply, alien, advance )
    -- Put text into quotation marks
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: quoted string
    local text = apply and tostring( apply ) or ""
    local slang
    if type( alien ) == "string" then
        slang = mw.text.trim( alien ):lower()
    else
        -- No language given: use the page language if the title
        -- object offers it, else the content language of the wiki.
        -- TODO FIXME: Introduction expected 2017-04
        slang = mw.title.getCurrentTitle().pageLanguage
                or mw.language.getContentLanguage():getCode()
    end
    -- Anything but an explicit 2 counts as level 1
    local mode = ( advance == 2 ) and 2 or 1
    return fiatQuote( mw.text.trim( text ), slang, mode )
end -- Text.quote()
Text.quoteUnquoted = function ( apply, alien, advance )
    -- Quote text, if not yet quoted and not empty
    -- The text counts as quoted already if its first or its last
    -- character is a quotation mark.
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: string; possibly quoted
    local r = mw.text.trim( apply and tostring( apply ) or "" )
    local s = mw.ustring.sub( r, 1, 1 )
    if s ~= "" and not Text.isQuote( s ) then
        -- Last character: from index -1 to the end. An end index
        -- of 1 would yield "" for every text longer than one
        -- character, and the check below could never succeed.
        s = mw.ustring.sub( r, -1 )
        if not Text.isQuote( s ) then
            r = Text.quote( r, alien, advance )
        end
    end
    return r
end -- Text.quoteUnquoted()
Text.removeDiacritics = function ( adjust )
    -- Strip combining marks from a text
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; all latin letters should be ASCII
    --                  or basic greek or cyrillic or symbols etc.
    if not PatternCombined then
        -- Character class made of the codepoint ranges to be removed
        local ranges = { { 0x0300, 0x036F },
                         { 0x1AB0, 0x1AFF },
                         { 0x1DC0, 0x1DFF },
                         { 0xFE20, 0xFE2F } }
        local set = { 91 }                      -- "["
        for _, range in ipairs( ranges ) do
            set[ #set + 1 ] = range[ 1 ]
            set[ #set + 1 ] = 45                -- "-"
            set[ #set + 1 ] = range[ 2 ]
        end
        set[ #set + 1 ] = 93                    -- "]"
        PatternCombined = mw.ustring.char( unpack( set ) )
    end
    local source = adjust and tostring( adjust ) or ""
    -- Decompose, drop the marks, compose again
    local bare = mw.ustring.gsub( mw.ustring.toNFD( source ),
                                  PatternCombined,
                                  "" )
    return mw.ustring.toNFC( bare )
end -- Text.removeDiacritics()
Text.sentenceTerminated = function ( analyse )
    -- Is string terminated by dot, question or exclamation mark?
    --     Quotation, link termination and so on granted
    -- Parameter:
    --     analyse  -- string; other values are converted, nil is ""
    -- Returns: true, if sentence terminated
    -- Normalise the input like the other functions of this module do;
    -- a nil argument simply counts as not terminated.
    analyse = analyse and tostring( analyse ) or ""
    if not PatternTerminated then
        -- A terminating character -- 12290 (U+3002), 65281 (U+FF01),
        -- 65294 (U+FF0E), 65311 (U+FF1F), "!", ".", "?" or the
        -- ellipsis -- followed by any number of quotation marks or
        -- closing brackets up to the end of the string.
        PatternTerminated = mw.ustring.char( 91,
                                             12290,
                                             65281,
                                             65294,
                                             65311 )
                            .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
    return mw.ustring.find( analyse, PatternTerminated ) ~= nil
end -- Text.sentenceTerminated()
Text.ucfirstAll = function ( adjust)
    -- Turn the first letter of every word into upper case
    -- Arguments:
    --    adjust = string to adjust
    -- Returns: string with all first letters in upper case
    local source = adjust and tostring( adjust ) or ""
    local decoded = mw.text.decode( source, true )
    -- Remember whether entities were decoded, to encode again later
    local entities = ( decoded ~= source )
    -- A leading blank lets the very first word match "%W%l" as well
    local r = " " .. decoded
    local at = mw.ustring.find( r, "%W%l", 1 )
    while at do
        local letter = mw.ustring.sub( r, at + 1, at + 1 )
        r = mw.ustring.sub( r, 1, at )
            .. mw.ustring.upper( letter )
            .. mw.ustring.sub( r, at + 2 )
        at = mw.ustring.find( r, "%W%l", at + 1 )
    end
    -- Drop the leading blank again
    r = r:sub( 2 )
    if entities then
        r = mw.text.encode( r )
    end
    return r
end -- Text.ucfirstAll()
Text.uprightNonlatin = function ( adjust )
-- Ensure non-italics for non-latin text parts
-- One single greek letter might be granted
-- Precondition:
-- adjust -- string
-- Returns: string with non-latin parts enclosed in <span>
local r
initLatinData()
if mw.ustring.match( adjust, PatternLatin ) then
-- latin only, horizontal dashes, quotes: nothing to wrap
r = adjust
else
-- Mixed content: scan the text codepoint by codepoint.
-- c: current codepoint (reused for a character string while
--    stepping back over blanks and parentheses)
local c
-- j: index where the currently open non-latin run began, or false
local j = false
-- k: index of the first character not yet copied into r
local k = 1
-- m: set when the open run began with a codepoint in 880..1023:
--    the index at which a latin character means that the run was
--    a single greek letter; false otherwise
local m = false
-- n: length of the text in codepoints
local n = mw.ustring.len( adjust )
-- Slots: result so far, pending unwrapped text, text to be wrapped
local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
local flat = function ( a )
-- isLatin: true if codepoint a is within one of RangesLatin;
-- no value (nil) is returned otherwise
local range
for i = 1, #RangesLatin do
range = RangesLatin[ i ]
if a >= range[ 1 ] and a <= range[ 2 ] then
return true
end
end -- for i
end -- flat()
local focus = function ( a )
-- char is not ambivalent:
-- true for codepoints above 64 outside 8192..8212,
-- and for '&' and '<'; all others neither open nor close a run
local r = ( a > 64 )
if r then
r = ( a < 8192 or a > 8212 )
else
r = ( a == 38 or a == 60 ) -- '&' '<'
end
return r
end -- focus()
local form = function ( a )
-- Result so far, plus the pending text k..j-1 as it is,
-- plus the run j..a enclosed in the <span>
return string.format( span,
r,
mw.ustring.sub( adjust, k, j - 1 ),
mw.ustring.sub( adjust, j, a ) )
end -- form()
r = ""
for i = 1, n do
c = mw.ustring.codepoint( adjust, i, i )
if focus( c ) then
if flat( c ) then
-- A latin character terminates an open run.
if j then
if m then
if i == m then
-- single greek letter.
-- The run is dropped without a <span>; k is left
-- unchanged, so the letter stays in the pending text.
j = false
end
m = false
end
if j then
-- Step back over blanks and opening parentheses just
-- before the latin character: they are appended after
-- the <span> instead of being enclosed.
local nx = i - 1
local s = ""
for ix = nx, 1, -1 do
c = mw.ustring.sub( adjust, ix, ix )
if c == " " or c == "(" then
nx = nx - 1
s = c .. s
else
break -- for ix
end
end -- for ix
r = form( nx ) .. s
j = false
k = i
end
end
elseif not j then
-- A non-latin character opens a new run.
j = i
if c >= 880 and c <= 1023 then
-- single greek letter?
m = i + 1
else
m = false
end
end
elseif m then
-- Ambivalent character while a possible single greek letter
-- is pending: move the expected index along.
m = m + 1
end
end -- for i
if j and ( not m or m < n ) then
-- A run is still open at the end of the text: enclose it
r = form( n )
else
-- Append whatever has not been copied yet
r = r .. mw.ustring.sub( adjust, k )
end
end
return r
end -- Text.uprightNonlatin()
Text.test = function ( about )
    -- Give access to internal data for testing purposes
    -- Parameter:
    --     about  -- string; "quote" is the only known request
    -- Returns: table with QuoteLang and QuoteType for "quote",
    --          else nil
    if about ~= "quote" then
        return nil
    end
    initQuoteData()
    return { QuoteLang = QuoteLang,
             QuoteType = QuoteType }
end -- Text.test()
-- Export
local p = { }

-- Yes/no functions: the first unnamed parameter is analysed;
-- the answer is "1" for true and "" for false.
for _, name in ipairs( { "containsCJK",
                         "isLatinRange",
                         "isQuote",
                         "sentenceTerminated" } ) do
    p[ name ] = function ( frame )
        if Text[ name ]( frame.args[ 1 ] or "" ) then
            return "1"
        end
        return ""
    end
end

-- Converting functions: the first unnamed parameter is processed
-- and the result is handed back unchanged.
for _, name in ipairs( { "getPlain",
                         "removeDiacritics",
                         "ucfirstAll",
                         "uprightNonlatin" } ) do
    p[ name ] = function ( frame )
        return Text[ name ]( frame.args[ 1 ] or "" )
    end
end
function p.char( frame )
    -- Entry point for Text.char()
    -- The first unnamed parameter is taken from the parent frame; if that
    -- is absent, the arguments of this frame are used instead.
    -- Parameters read:
    --     1       -- whitespace separated list of codes
    --     errors  -- evaluated by yesNo(); lenient only if that yields false
    --     *       -- converted by tonumber() and passed on as multiplier
    -- Returns whatever Text.char( codes, multiple, lenient ) returns
    local params = frame:getParent().args
    local story = params[ 1 ]
    if not story then
        params = frame.args
        story = params[ 1 ]
    end
    local codes, lenient, multiple
    if story then
        local items = mw.text.split( mw.text.trim( story ), "%s+" )
        if #items > 0 then
            lenient = ( yesNo( params.errors ) == false )
            codes = { }
            multiple = tonumber( params[ "*" ] )
            for idx = 1, #items do
                local word = items[ idx ]
                local numeral = word
                if word:sub( 1, 1 ) == "x" then
                    -- "x41" becomes "0x41" so tonumber() reads it as hex
                    numeral = "0" .. word
                end
                -- keep the raw word if it is not numeric
                codes[ #codes + 1 ] = tonumber( numeral ) or word
            end -- for idx
        end
    end
    return Text.char( codes, multiple, lenient )
end -- p.char()
function p.concatParams( frame )
    -- Entry point for Text.concatParams()
    -- Parameters (frame.args):
    --     template   -- "1" (after trimming): take the list from the
    --                   parent frame's arguments, else from frame.args
    --     separator  -- passed through
    --     format     -- passed through
    local template = frame.args.template
    if type( template ) == "string" then
        template = ( mw.text.trim( template ) == "1" )
    end
    local origin = frame
    if template then
        origin = frame:getParent()
    end
    return Text.concatParams( origin.args,
                              frame.args.separator,
                              frame.args.format )
end -- p.concatParams()
function p.listToFormat( frame )
    -- Apply a format string to the items of one or more separated lists
    -- Parameters (frame.args):
    --     1, 2, ...  -- list strings; list j fills the j-th "%s"
    --     format     -- string with "%s" placeholders; missing means ""
    --     sep        -- separator handed to mw.text.split() (treated as
    --                   a pattern there); defaults to ";"
    -- Returns string; one formatted line per item index, concatenated
    --     without any delimiter
    local pformat = frame.args[ "format" ] or ""
    local sep = frame.args[ "sep" ] or ";"
    local lists = { }
    -- Parse parameters: collect the lists by numeric key
    for k, v in pairs( frame.args ) do
        local knum = tonumber( k )
        if knum then
            lists[ knum ] = v
        end
    end -- for k, v
    -- Count the contiguous lists starting at 1; the length operator
    -- is unspecified on a table with holes.
    local count = 0
    while lists[ count + 1 ] ~= nil do
        count = count + 1
    end
    -- Split lists
    local maxListLen = 0
    for i = 1, count do
        lists[ i ] = mw.text.split( lists[ i ], sep )
        if #lists[ i ] > maxListLen then
            maxListLen = #lists[ i ]
        end
    end -- for i
    -- Generate result string
    local lines = { }
    for i = 1, maxListLen do
        local j = 0
        -- Single pass with a function replacement: values are inserted
        -- literally ("%" in data is harmless), a value containing "%s"
        -- is never re-substituted, and a list that is too short
        -- contributes "" instead of raising an error.
        lines[ i ] = mw.ustring.gsub( pformat, "%%s", function ()
            j = j + 1
            if j > count then
                -- more placeholders than lists: keep the placeholder
                return "%s"
            end
            return lists[ j ][ i ] or ""
        end )
    end -- for i
    return table.concat( lines )
end -- p.listToFormat()
function p.listToText( frame )
    -- Entry point for Text.listToText()
    -- Parameters (frame.args):
    --     template  -- "1" (after trimming): use the parent frame's
    --                  arguments as list, else this frame's arguments
    --     format    -- passed through
    local viaTemplate = frame.args.template
    if type( viaTemplate ) == "string" then
        viaTemplate = mw.text.trim( viaTemplate )
        viaTemplate = ( viaTemplate == "1" )
    end
    if viaTemplate then
        return Text.listToText( frame:getParent().args, frame.args.format )
    end
    return Text.listToText( frame.args, frame.args.format )
end -- p.listToText()
function p.quote( frame )
    -- Entry point for Text.quote()
    -- Parameters (frame.args):
    --     1  -- text; defaults to ""
    --     2  -- language code; trimmed, an empty string becomes false
    --     3  -- passed through tonumber()
    local slang = frame.args[ 2 ]
    if type( slang ) == "string" then
        local trimmed = mw.text.trim( slang )
        if trimmed == "" then
            slang = false
        else
            slang = trimmed
        end
    end
    local text = frame.args[ 1 ] or ""
    return Text.quote( text, slang, tonumber( frame.args[ 3 ] ) )
end -- p.quote()
function p.quoteUnquoted( frame )
    -- Entry point for Text.quoteUnquoted()
    -- Parameters (frame.args):
    --     1  -- text; defaults to ""
    --     2  -- language code; trimmed, an empty string becomes false
    --     3  -- passed through tonumber()
    local slang = frame.args[ 2 ]
    if type( slang ) == "string" then
        slang = mw.text.trim( slang )
        -- non-empty strings are truthy, so this yields the string or false
        slang = ( slang ~= "" ) and slang
    end
    local text = frame.args[ 1 ] or ""
    local layer = tonumber( frame.args[ 3 ] )
    return Text.quoteUnquoted( text, slang, layer )
end -- p.quoteUnquoted()
function p.zip( frame )
    -- Interleave the items of several separated lists
    -- Parameters (frame.args):
    --     1, 2, ...        -- list strings
    --     sep              -- default separator for splitting; "" if absent
    --     sep1, sep2, ...  -- separator for the list with that number
    --     isep             -- inserted between items of the same index
    --     osep             -- inserted between successive index groups
    -- Returns string
    local args = frame.args
    local fallback = args[ "sep" ] or ""
    local inner = args[ "isep" ] or ""
    local outer = args[ "osep" ] or ""
    local lists = { }
    local seps = { }
    -- Parse parameters
    for key, value in pairs( args ) do
        local index = tonumber( key )
        if index then
            lists[ index ] = value
        elseif key:sub( 1, 3 ) == "sep" then
            local which = tonumber( key:sub( 4 ) )
            if which then
                seps[ which ] = value
            end
        end
    end -- for key, value
    -- Where no explicit separator was given, use the default separator
    for i = 1, math.max( #seps, #lists ) do
        seps[ i ] = seps[ i ] or fallback
    end -- for i
    -- Split lists
    local longest = 0
    for i = 1, #lists do
        local items = mw.text.split( lists[ i ], seps[ i ] )
        lists[ i ] = items
        if #items > longest then
            longest = #items
        end
    end -- for i
    -- Assemble: index groups joined by outer, items within by inner;
    -- a list that has run out contributes ""
    local width = #lists
    local pieces = { }
    for i = 1, longest do
        if i ~= 1 then
            pieces[ #pieces + 1 ] = outer
        end
        for j = 1, width do
            if j ~= 1 then
                pieces[ #pieces + 1 ] = inner
            end
            pieces[ #pieces + 1 ] = lists[ j ][ i ] or ""
        end -- for j
    end -- for i
    return table.concat( pieces )
end -- p.zip()
p.failsafe = function ()
    -- Version identification
    -- Returns the value of Text.serial
    return Text.serial
end -- p.failsafe()
function p.Text()
    -- Give other modules access to the Text table itself
    -- Returns the Text table
    return Text
end -- p.Text()

-- The module's export table
return p