Module:Delink: Difference between revisions

From the Croc Wiki, the Croc encyclopedia
Jump to navigation · Jump to search
Content added Content deleted
(since mw.ustring is 30x slower than string, only use it when necessary)
m (57 revisions imported from wikipedia:Module:Delink)
 
(6 intermediate revisions by 5 users not shown)
Line 1: Line 1:
-- This module de-links most wikitext.
-- This module de-links most wikitext.

require("Module:No globals")


local p = {}
local p = {}

local getArgs


local function delinkReversePipeTrick(s)
local function delinkReversePipeTrick(s)
if s:match("^%[%[|.*[|\n]") then -- Check for newlines or multiple pipes.
if s:match("^%[%[|.*[|\n]") then -- Check for newlines or multiple pipes.
return s
return s
end
else
return s:match("%[%[|(.*)%]%]")
return s:match("%[%[|(.*)%]%]")

end
end
end


local function delinkPipeTrick(s)
local function delinkPipeTrick(s)
-- We need to deal with colons, brackets, and commas, per [[Help:Pipe trick]].
local linkarea, display = "", ""
-- First, remove the text before the first colon, if any.
-- We need to deal with colons, brackets, and commas, per [[Help:Pipe trick]].
if s:match(":") then
s = s:match("%[%[.-:(.*)|%]%]")
-- First, remove the text before the first colon, if any.
-- If there are no colons, grab all of the text apart from the square brackets and the pipe.
if s:match(":") then
else
s = s:match("%[%[.-:(.*)|%]%]")
s = s:match("%[%[(.*)|%]%]")
-- If there are no colons, grab all of the text apart from the square brackets and the pipe.
end
else
s = s:match("%[%[(.*)|%]%]")
-- Next up, brackets and commas.
end
if s:match("%(.-%)$") then -- Brackets trump commas.
s = s:match("(.-) ?%(.-%)$")
-- Next up, brackets and commas.
if s:match("%(.-%)$") then -- Brackets trump commas.
elseif s:match(",") then -- If there are no brackets, display only the text before the first comma.
s = s:match("(.-) ?%(.-%)$")
s = s:match("(.-),.*$")
end
elseif s:match(",") then -- If there are no brackets, display only the text before the first comma.
return s
s = s:match("(.-),.*$")
end
return s
end
end


-- Return wikilink target |wikilinks=target
local function delinkWikilink(s)
local result = s
local function getDelinkedTarget(s)
local result = s
-- Deal with the reverse pipe trick.
-- Deal with the reverse pipe trick.
if result:match("%[%[|") then
if result:match("%[%[|") then
return delinkReversePipeTrick(result)
return delinkReversePipeTrick(result)
end
end
result = mw.uri.decode(result, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
result = mw.text.decode(result, true) -- decode HTML entities.
result = mw.uri.decode(result, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
result = mw.text.decode(result, true) -- decode HTML entities.
-- Check for bad titles. To do this we need to find the
-- Check for bad titles. To do this we need to find the
-- title area of the link, i.e. the part before any pipes.
-- title area of the link, i.e. the part before any pipes.
local titlearea
local target_area
if result:match("|") then -- Find if we're dealing with a piped link.
if result:match("|") then -- Find if we're dealing with a piped link.
titlearea = result:match("^%[%[(.-)|.*%]%]")
target_area = result:match("^%[%[(.-)|.*%]%]")
else
else
titlearea = result:match("^%[%[(.-)%]%]")
target_area = result:match("^%[%[(.-)%]%]")
end
end
-- Check for bad characters.
if mw.ustring.match(titlearea, "[%[%]<>{}%%%c\n]") then
return s
end
-- Check for categories, interwikis, and files.
local colonprefix = result:match("%[%[(.-):.*%]%]") or "" -- Get the text before the first colon.
local ns = mw.site.namespaces[colonprefix] -- see if this is a known namespace
if mw.language.isKnownLanguageTag(colonprefix)
or ( ns and ( ns.canonicalName == "File" or ns.canonicalName == "Category" ) ) then
return ""
end
-- Remove the colon if the link is using the [[Help:Colon trick]].
if result:match("%[%[:") then
result = "[[" .. result:match("%[%[:(.*%]%])")
end
-- Deal with links using the [[Help:Pipe trick]].
if mw.ustring.match(result, "^%[%[[^|]*|%]%]") then
return delinkPipeTrick(result)
end
-- Find the display area of the wikilink
if result:match("|") then -- Find if we're dealing with a piped link.
result = result:match("^%[%[.-|(.+)%]%]")
-- Remove new lines from the display of multiline piped links,
-- where the pipe is before the first new line.
result = result:gsub("\n", "")
else
result = result:match("^%[%[(.-)%]%]")
end


-- Check for bad characters.
return result
if mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") and mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") ~= "?" then
return s
end
return target_area
end

local function getDelinkedLabel(s)
local result = s
-- Deal with the reverse pipe trick.
if result:match("%[%[|") then
return delinkReversePipeTrick(result)
end

result = mw.uri.decode(result, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
result = mw.text.decode(result, true) -- decode HTML entities.

-- Check for bad titles. To do this we need to find the
-- title area of the link, i.e. the part before any pipes.
local target_area
if result:match("|") then -- Find if we're dealing with a piped link.
target_area = result:match("^%[%[(.-)|.*%]%]")
else
target_area = result:match("^%[%[(.-)%]%]")
end

-- Check for bad characters.
if mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") and mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") ~= "?" then
return s
end

-- Check for categories, interwikis, and files.
local colon_prefix = result:match("%[%[(.-):.*%]%]") or "" -- Get the text before the first colon.
local ns = mw.site.namespaces[colon_prefix] -- see if this is a known namespace
if mw.language.isKnownLanguageTag(colon_prefix) or (ns and (ns.canonicalName == "File" or ns.canonicalName == "Category")) then
return ""
end

-- Remove the colon if the link is using the [[Help:Colon trick]].
if result:match("%[%[:") then
result = "[[" .. result:match("%[%[:(.*%]%])")
end

-- Deal with links using the [[Help:Pipe trick]].
if mw.ustring.match(result, "^%[%[[^|]*|%]%]") then
return delinkPipeTrick(result)
end
-- Find the display area of the wikilink
if result:match("|") then -- Find if we're dealing with a piped link.
result = result:match("^%[%[.-|(.+)%]%]")
-- Remove new lines from the display of multiline piped links,
-- where the pipe is before the first new line.
result = result:gsub("\n", "")
else
result = result:match("^%[%[(.-)%]%]")
end

return result
end
end


local function delinkURL(s)
local function delinkURL(s)
-- Assume we have already delinked internal wikilinks, and that
-- Assume we have already delinked internal wikilinks, and that
-- we have been passed some text between two square brackets [foo].
-- we have been passed some text between two square brackets [foo].
-- If the text contains a line break it is not formatted as a URL, regardless of other content.
-- If the text contains a line break it is not formatted as a URL, regardless of other content.
if s:match("\n") then
if s:match("\n") then
return s
return s
end
end
-- Check if the text has a valid URL prefix and at least one valid URL character.
-- Check if the text has a valid URL prefix and at least one valid URL character.
local valid_url_prefixes = {"//", "http://", "https://", "ftp://", "gopher://", "mailto:", "news:", "irc://"}
local valid_url_prefixes = {"//", "http://", "https://", "ftp://", "gopher://", "mailto:", "news:", "irc://"}
local url_prefix
local url_prefix
for i,v in ipairs(valid_url_prefixes) do
for _ ,v in ipairs(valid_url_prefixes) do
if mw.ustring.match(s, '^%[' .. v ..'[^"%s].*%]' ) then
if mw.ustring.match(s, '^%[' .. v ..'[^"%s].*%]' ) then
url_prefix = v
url_prefix = v
break
break
end
end
end
end
-- Get display text
-- Get display text
if not url_prefix then
if not url_prefix then
return s
return s
end
end
s = s:match("^%[" .. url_prefix .. "(.*)%]") -- Grab all of the text after the URL prefix and before the final square bracket.
s = s:match("^%[" .. url_prefix .. "(.*)%]") -- Grab all of the text after the URL prefix and before the final square bracket.
s = s:match('^.-(["<> ].*)') or "" -- Grab all of the text after the first URL separator character ("<> ).
s = s:match('^.-(["<> ].*)') or "" -- Grab all of the text after the first URL separator character ("<> ).
s = mw.ustring.match(s, "^%s*(%S.*)$") or "" -- If the separating character was a space, trim it off.
s = mw.ustring.match(s, "^%s*(%S.*)$") or "" -- If the separating character was a space, trim it off.
s_decoded = mw.text.decode(s, true)
local s_decoded = mw.text.decode(s, true)
if mw.ustring.match(s_decoded, "%c") then
if mw.ustring.match(s_decoded, "%c") then
return s
return s
end
else

return s_decoded
return s_decoded
end

end
end


local function delinkLinkClass(s, pattern, delinkFunction)
local function delinkLinkClass(text, pattern, delinkFunction)
if not type(s) == "string" then
if type(text) ~= "string" then
error("Attempt to de-link non-string input.", 2)
error("Attempt to de-link non-string input.", 2)
end
end
if not ( type(pattern) == "string" and mw.ustring.sub(pattern, 1, 1) == "^" ) then
if type(pattern) ~= "string" or mw.ustring.sub(pattern, 1, 1) ~= "^" then
error('Invalid pattern detected. Patterns must begin with "^".', 2)
error('Invalid pattern detected. Patterns must begin with "^".', 2)
end
end
-- Iterate over the text string, and replace any matched text. using the
-- Iterate over the text string, and replace any matched text. using the
-- delink function. We need to iterate character by character rather
-- delink function. We need to iterate character by character rather
-- than just use gsub, otherwise nested links aren't detected properly.
-- than just use gsub, otherwise nested links aren't detected properly.
local result = ""
local result = ""
while s ~= '' do
while text ~= "" do
-- Replace text using one iteration of gsub.
-- Replace text using one iteration of gsub.
s = mw.ustring.gsub(s, pattern, delinkFunction, 1)
text = mw.ustring.gsub(text, pattern, delinkFunction, 1)
-- Append the left-most character to the result string.
-- Append the left-most character to the result string.
result = result .. mw.ustring.sub(s, 1, 1)
result = result .. mw.ustring.sub(text, 1, 1)
s = mw.ustring.sub(s, 2, -1)
text = mw.ustring.sub(text, 2, -1)
end
end
return result
return result
end
end


function p._delink(args)
function p._delink(args)
local text = args[1] or ""
local text = args[1] or ""
if args.refs == "yes" then
if args.refs == "yes" then
-- Remove any [[Help:Strip markers]] representing ref tags. In most situations
-- Remove any [[Help:Strip markers]] representing ref tags. In most situations
-- this is not a good idea - only use it if you know what you are doing!
-- this is not a good idea - only use it if you know what you are doing!
text = mw.ustring.gsub(text, "UNIQ%w*%-ref%-%d*%-QINU", "")
text = mw.ustring.gsub(text, "UNIQ%w*%-ref%-%d*%-QINU", "")
end
end
if not (args.comments == "no") then
if args.comments ~= "no" then
text = text:gsub("<!%-%-.-%-%->", "") -- Remove html comments.
text = text:gsub("<!%-%-.-%-%->", "") -- Remove html comments.
end
end

if not (args.wikilinks == "no") then
if args.wikilinks ~= "no" and args.wikilinks ~= "target" then
text = delinkLinkClass(text, "^%[%[.-%]%]", delinkWikilink) -- De-link wikilinks.
-- De-link wikilinks and return the label portion of the wikilink.
end
text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedLabel)
if not (args.urls == "no") then
elseif args.wikilinks == "target" then
text = delinkLinkClass(text, "^%[.-%]", delinkURL) -- De-link URLs.
-- De-link wikilinks and return the target portions of the wikilink.
end
text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedTarget)
if not (args.whitespace == "no") then
end
-- Replace single new lines with a single space, but leave double new lines
if args.urls ~= "no" then
-- and new lines only containing spaces or tabs before a second new line.
text = mw.ustring.gsub(text, "([^\n \t][ \t]*)\n([ \t]*[^\n \t])", "%1 %2")
text = delinkLinkClass(text, "^%[.-%]", delinkURL) -- De-link URLs.
end
text = text:gsub("[ \t]+", " ") -- Remove extra tabs and spaces.
if args.whitespace ~= "no" then
end
-- Replace single new lines with a single space, but leave double new lines
return text
-- and new lines only containing spaces or tabs before a second new line.
text = mw.ustring.gsub(text, "([^\n \t][ \t]*)\n([ \t]*[^\n \t])", "%1 %2")
text = text:gsub("[ \t]+", " ") -- Remove extra tabs and spaces.
end
return text
end
end


function p.delink(frame)
function p.delink(frame)
if not getArgs then
local args
getArgs = require('Module:Arguments').getArgs
if frame == mw.getCurrentFrame() then
end
-- We're being called via #invoke. If the invoking template passed any args, use
return p._delink(getArgs(frame, {wrappers = 'Template:Delink'}))
-- them. Otherwise, use the args that were passed into the template.
args = frame:getParent().args
for k, v in pairs(frame.args) do
args = frame.args
break
end
else
-- We're being called from another module or from the debug console, so assume
-- the args are passed in directly.
args = frame
end
return p._delink(args)
end
end



Latest revision as of 17:30, February 1, 2022

Documentation for this module may be created at Module:Delink/doc

-- This module de-links most wikitext.

require("Module:No globals")

local p = {}

local getArgs

-- Handles links written with the reverse pipe trick, i.e. "[[|label]]".
-- @param s  link text containing "[[|"
-- @return   the text between "[[|" and the final "]]"; or `s` unchanged when
--           the link starts with "[[|" and contains a further pipe or a
--           newline after it.
local function delinkReversePipeTrick(s)
	-- A second pipe or a line break after the leading "[[|" disqualifies the link.
	local is_malformed = s:find("^%[%[|.*[|\n]") ~= nil
	if not is_malformed then
		return s:match("%[%[|(.*)%]%]")
	end
	return s
end

-- Computes the display text for a link that uses the [[Help:Pipe trick]],
-- i.e. a link of the form "[[Target|]]".
-- @param s  the full link text, including the square brackets and the pipe
-- @return   the target with everything up to the first colon removed, then
--           with a trailing parenthesised part removed or, failing that,
--           cut at the first comma.
local function delinkPipeTrick(s)
	-- Step 1: isolate the title. With a colon present, everything up to and
	-- including the first colon is dropped; otherwise only the brackets and
	-- the pipe are stripped.
	local title
	if s:find(":") then
		title = s:match("%[%[.-:(.*)|%]%]")
	else
		title = s:match("%[%[(.*)|%]%]")
	end

	-- Step 2: a trailing "(...)" takes precedence over commas.
	if title:find("%(.-%)$") then
		return title:match("(.-) ?%(.-%)$")
	end

	-- Step 3: with no trailing brackets, keep only the text before the first comma.
	if title:find(",") then
		return title:match("(.-),.*$")
	end

	return title
end

-- Returns the target portion of a single wikilink (used for |wikilinks=target).
-- @param s  the matched link text, of the form "[[...]]"
-- @return   the decoded text before the first pipe (or the whole link content
--           when there is no pipe); `s` unchanged when that text contains a
--           character from the bad-title set; or the result of
--           delinkReversePipeTrick when `s` contains "[[|".
local function getDelinkedTarget(s)
	-- Deal with the reverse pipe trick.
	if s:match("%[%[|") then
		return delinkReversePipeTrick(s)
	end

	local result = mw.uri.decode(s, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
	result = mw.text.decode(result, true) -- decode HTML entities.

	-- Check for bad titles. To do this we need to find the
	-- title area of the link, i.e. the part before any pipes.
	local target_area
	if result:match("|") then -- Find if we're dealing with a piped link.
		target_area = result:match("^%[%[(.-)|.*%]%]")
	else
		target_area = result:match("^%[%[(.-)%]%]")
	end

	-- Check for bad characters. A single match is sufficient: the character
	-- class can never yield "?", so the former second comparison against "?"
	-- was always true and only repeated the (slow) mw.ustring call.
	if mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") then
		return s
	end

	return target_area
end

-- Returns the label (display text) of a single wikilink.
-- @param s  the matched link text, of the form "[[...]]"
-- @return   one of:
--           * the result of delinkReversePipeTrick when `s` contains "[[|";
--           * `s` unchanged when the title area contains a bad-title character;
--           * "" when the text before the first colon is a known language tag
--             or names the File or Category namespace;
--           * the result of delinkPipeTrick for links of the form "[[Target|]]";
--           * otherwise the text after the first pipe with newlines removed,
--             or the whole link content when there is no pipe.
local function getDelinkedLabel(s)
	-- Deal with the reverse pipe trick.
	if s:match("%[%[|") then
		return delinkReversePipeTrick(s)
	end

	local result = mw.uri.decode(s, "PATH") -- decode percent-encoded entities. Leave underscores and plus signs.
	result = mw.text.decode(result, true) -- decode HTML entities.

	-- Check for bad titles. To do this we need to find the
	-- title area of the link, i.e. the part before any pipes.
	local target_area
	if result:match("|") then -- Find if we're dealing with a piped link.
		target_area = result:match("^%[%[(.-)|.*%]%]")
	else
		target_area = result:match("^%[%[(.-)%]%]")
	end

	-- Check for bad characters. A single match is sufficient: the character
	-- class can never yield "?", so the former second comparison against "?"
	-- was always true and only repeated the (slow) mw.ustring call.
	if mw.ustring.match(target_area, "[%[%]<>{}%%%c\n]") then
		return s
	end

	-- Check for categories, interwikis, and files.
	local colon_prefix = result:match("%[%[(.-):.*%]%]") or "" -- Get the text before the first colon.
	local ns = mw.site.namespaces[colon_prefix] -- see if this is a known namespace
	if mw.language.isKnownLanguageTag(colon_prefix) or (ns and (ns.canonicalName == "File" or ns.canonicalName == "Category")) then
		return ""
	end

	-- Remove the colon if the link is using the [[Help:Colon trick]].
	if result:match("%[%[:") then
		result = "[[" .. result:match("%[%[:(.*%]%])")
	end

	-- Deal with links using the [[Help:Pipe trick]].
	if mw.ustring.match(result, "^%[%[[^|]*|%]%]") then
		return delinkPipeTrick(result)
	end

	-- Find the display area of the wikilink
	if result:match("|") then -- Find if we're dealing with a piped link.
		result = result:match("^%[%[.-|(.+)%]%]")
		-- Remove new lines from the display of multiline piped links,
		-- where the pipe is before the first new line.
		result = result:gsub("\n", "")
	else
		result = result:match("^%[%[(.-)%]%]")
	end

	return result
end

-- De-links a single bracketed external link such as "[http://example.com label]".
-- Internal wikilinks are assumed to have been de-linked already, and `s` is
-- assumed to be some text between two square brackets, e.g. "[foo]".
-- @param s  the bracketed text
-- @return   `s` unchanged when it contains a line break or has no recognised
--           URL prefix; otherwise the link's display text (possibly ""), HTML
--           entity decoded unless decoding would introduce a control character.
local function delinkURL(s)
	-- Text containing a line break is never formatted as a URL.
	if s:find("\n") then
		return s
	end

	-- Look for a valid URL prefix followed by at least one valid URL character.
	local schemes = {"//", "http://", "https://", "ftp://", "gopher://", "mailto:", "news:", "irc://"}
	local scheme
	for idx = 1, #schemes do
		local candidate = schemes[idx]
		if mw.ustring.match(s, '^%[' .. candidate ..'[^"%s].*%]' ) then
			scheme = candidate
			break
		end
	end
	if scheme == nil then
		return s
	end

	-- Everything after the URL prefix and before the final square bracket.
	local label = s:match("^%[" .. scheme .. "(.*)%]")
	-- Everything from the first URL separator character ("<> ) onwards.
	label = label:match('^.-(["<> ].*)') or ""
	-- If the separating character was a space, trim it off.
	label = mw.ustring.match(label, "^%s*(%S.*)$") or ""

	-- Prefer the entity-decoded label, unless decoding yields control characters.
	local decoded = mw.text.decode(label, true)
	if mw.ustring.match(decoded, "%c") then
		return label
	end
	return decoded
end

-- Applies `delinkFunction` to every occurrence of `pattern` in `text`,
-- including occurrences that only appear after an earlier replacement.
-- @param text            string to process
-- @param pattern         ustring pattern; must start with "^" so that it is
--                        only ever tried at the current scan position
-- @param delinkFunction  replacement callback handed to mw.ustring.gsub
-- @return                the processed string
-- Raises an error (level 2) for non-string text or an unanchored/non-string pattern.
local function delinkLinkClass(text, pattern, delinkFunction)
	if type(text) ~= "string" then
		error("Attempt to de-link non-string input.", 2)
	end
	-- "^" is a single ASCII byte, so the plain string library is enough here.
	if type(pattern) ~= "string" or pattern:sub(1, 1) ~= "^" then
		error('Invalid pattern detected. Patterns must begin with "^".', 2)
	end
	-- Iterate over the text string, and replace any matched text using the
	-- delink function. We need to iterate character by character rather
	-- than just use gsub, otherwise nested links aren't detected properly.
	-- Characters are collected in a buffer and joined once at the end, which
	-- avoids re-copying the growing result string on every iteration.
	local pieces = {}
	local count = 0
	while text ~= "" do
		-- Replace text using one iteration of gsub.
		text = mw.ustring.gsub(text, pattern, delinkFunction, 1)
		-- Append the left-most character to the result buffer.
		count = count + 1
		pieces[count] = mw.ustring.sub(text, 1, 1)
		text = mw.ustring.sub(text, 2, -1)
	end
	return table.concat(pieces)
end

-- Core de-linking routine, callable directly from other modules.
-- @param args  table of options:
--   [1]         the wikitext to process (defaults to "")
--   refs        "yes" strips ref strip markers
--   comments    anything but "no" strips HTML comments
--   wikilinks   "no" leaves wikilinks alone, "target" keeps link targets,
--               anything else keeps link labels
--   urls        anything but "no" de-links bracketed URLs
--   whitespace  anything but "no" normalises newlines, tabs and spaces
-- @return the processed text
function p._delink(args)
	local text = args[1] or ""

	if args.refs == "yes" then
		-- Remove any [[Help:Strip markers]] representing ref tags. In most situations
		-- this is not a good idea - only use it if you know what you are doing!
		text = mw.ustring.gsub(text, "UNIQ%w*%-ref%-%d*%-QINU", "")
	end

	if args.comments ~= "no" then
		-- Remove html comments.
		text = text:gsub("<!%-%-.-%-%->", "")
	end

	local wikilink_mode = args.wikilinks
	if wikilink_mode == "target" then
		-- De-link wikilinks and return the target portions of the wikilink.
		text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedTarget)
	elseif wikilink_mode ~= "no" then
		-- De-link wikilinks and return the label portion of the wikilink.
		text = delinkLinkClass(text, "^%[%[.-%]%]", getDelinkedLabel)
	end

	if args.urls ~= "no" then
		-- De-link URLs.
		text = delinkLinkClass(text, "^%[.-%]", delinkURL)
	end

	if args.whitespace ~= "no" then
		-- Replace single new lines with a single space, but leave double new lines
		-- and new lines only containing spaces or tabs before a second new line.
		text = mw.ustring.gsub(text, "([^\n \t][ \t]*)\n([ \t]*[^\n \t])", "%1 %2")
		-- Remove extra tabs and spaces.
		text = text:gsub("[ \t]+", " ")
	end

	return text
end

-- Entry point for #invoke: collects the arguments through Module:Arguments
-- (loaded on first use), with Template:Delink as the wrapper template, and
-- hands them to p._delink.
-- @param frame  the frame object supplied by Scribunto
-- @return the de-linked text
function p.delink(frame)
	-- Load Module:Arguments lazily and cache its getArgs function.
	getArgs = getArgs or require('Module:Arguments').getArgs
	local args = getArgs(frame, {wrappers = 'Template:Delink'})
	return p._delink(args)
end

return p