Modul:template parser
Bu modul uchun Modul:template parser/doc nomli hujjat sahifasini yaratishingiz mumkin
local concat = table.concat
local get_entities = require("Module:utilities").get_entities
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local lower = string.lower
local match = string.match
local rawset = rawset
local select = select
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ulower = string.ulower
local m_parser = require("Module:parser")
local data = mw.loadData("Module:template parser/data")
local frame = mw.getCurrentFrame()
local export = {}
------------------------------------------------------------------------------------
--
-- Helper functions
--
------------------------------------------------------------------------------------
-- Returns true when `this` is exactly one ASCII spacing character
-- (space, tab, newline, vertical tab, form feed or carriage return), and false otherwise.
local function is_space(this)
	if this == " " or this == "\t" or this == "\n" then
		return true
	end
	return this == "\v" or this == "\f" or this == "\r"
end
-- Strips leading and trailing ASCII spacing characters from `str`.
-- Returns "" if the string is empty or consists only of spacing characters.
-- Note: scanning with sub is much faster than the equivalent string patterns.
local function trim(str)
	local len = #str
	-- Find the first non-space character.
	local first = 1
	while first <= len and is_space(sub(str, first, first)) do
		first = first + 1
	end
	if first > len then
		return ""
	end
	-- A non-space character exists at `first`, so this scan always stops at or after it.
	local last = len
	while is_space(sub(str, last, last)) do
		last = last - 1
	end
	return sub(str, first, last)
end
------------------------------------------------------------------------------------
--
-- Nodes
--
------------------------------------------------------------------------------------
local Node = m_parser.Node
local Wikitext = m_parser.Wikitext
local Tag = Node:new("tag")

-- Serialises the tag back into markup.
-- Ignored tags yield an empty string; self-closing tags yield "<name attrs/>";
-- anything else yields the opening tag, the concatenated contents and the closing tag.
function Tag:__tostring()
	local pieces = {"<", self.name}
	if self.ignored then
		return ""
	end
	local attributes = self.attributes
	if attributes then
		for attr, value in pairs(attributes) do
			pieces[#pieces + 1] = " " .. attr .. "=\"" .. value .. "\""
		end
	end
	if self.self_closing then
		pieces[#pieces + 1] = "/>"
		return concat(pieces)
	end
	pieces[#pieces + 1] = ">"
	local opening = concat(pieces)
	return opening .. concat(self) .. "</" .. self.name .. ">"
end
local Argument = Node:new("argument")

-- Serialises the argument as "{{{name|default|...}}}".
-- With only a name it yields "{{{name}}}"; with no values at all it yields the literal string "argument".
function Argument:__tostring()
	if not self[2] then
		if self[1] then
			return "{{{" .. tostring(self[1]) .. "}}}"
		end
		return "argument"
	end
	local output = {"{{{", tostring(self[1])}
	local i = 2
	local part = self[i]
	-- Every value from the second onwards is emitted, each preceded by a pipe.
	while part do
		output[#output + 1] = "|"
		output[#output + 1] = tostring(part)
		i = i + 1
		part = self[i]
	end
	output[#output + 1] = "}}}"
	return concat(output)
end
-- Iteration step which only visits the first two values (name and default value).
-- Returns the value at the next index, the node itself and that index, or nothing once past index 2.
function Argument:next(i)
	local nxt = i + 1
	if not (nxt <= 2) then
		return
	end
	return self[nxt], self, nxt
end
local Parameter = Node:new("parameter")

-- Serialises the parameter: "key=value" if it has an explicit key, otherwise just the value.
function Parameter:__tostring()
	local key = self.key
	if not key then
		return Node.__tostring(self)
	end
	local prefix = tostring(key) .. "="
	return prefix .. Node.__tostring(self)
end
local Template = Node:new("template")

-- Serialises the template back into wikitext.
-- * No values at all: the literal string "template".
-- * Only a name: "{{name}}".
-- * Otherwise: "{{name|p1|p2|...}}", or "{{name:p1|p2|...}}" if the `colon` flag is set, in which case the first value after the name is joined with a colon instead of a pipe.
-- Fix: the colon branch previously emitted self[3] after the colon and then resumed the pipe loop at index 3, which dropped self[2] and output self[3] twice. It now emits self[2].
-- Note: nothing in this module sets `colon`, so the branch only matters for callers that set it themselves.
function Template:__tostring()
	if not self[2] then
		if self[1] then
			return "{{" .. tostring(self[1]) .. "}}"
		end
		return "template"
	end
	local output, n = {"{{", tostring(self[1])}, 2
	if self.colon then
		-- The value joined by the colon is self[2]; pipe-separated values resume at self[3].
		insert(output, ":")
		insert(output, tostring(self[2]))
		n = 3
	end
	for i = n, #self do
		insert(output, "|")
		insert(output, tostring(self[i]))
	end
	insert(output, "}}")
	return concat(output)
end
-- Builds a table of the template's parameters, keyed by parameter name/number.
-- Explicit parameter keys are converted to numbers if:
-- (a) They are integers, with no decimals (2.0) or leading zeroes (02).
-- (b) They are <= 2^53 and >= -2^53.
-- Note: Lua integers are only accurate to 2^53 - 1, so 2^53 and -2^53 have to be specifically checked for since Lua will evaluate 2^53 as equal to 2^53 + 1.
-- Explicit keys and their values are trimmed; implicit (positional) parameters are numbered from 1 and their values are left untrimmed.
function Template:get_params()
	local params, implicit = {}, 0
	for i = 2, self.len do
		local param = self[i]
		local raw_key = param.key
		if raw_key then
			local key = trim(tostring(raw_key))
			if key == "0" or match(key, "^-?[1-9]%d*$") then
				local num = tonumber(key)
				local in_safe_range = num <= 9007199254740991 and num >= -9007199254740991
				if in_safe_range or key == "9007199254740992" or key == "-9007199254740992" then
					key = num
				end
			end
			-- Node.__tostring gives the value without the "key=" prefix.
			params[key] = trim(Node.__tostring(param))
		else
			implicit = implicit + 1
			params[implicit] = tostring(param)
		end
	end
	return params
end
------------------------------------------------------------------------------------
--
-- Parser
--
------------------------------------------------------------------------------------
local Parser = m_parser.Parser
-- Wraps the inherited `new` method so that each new parser also starts with raw_head set to 1.
do
	local base_new = Parser.new

	function Parser:new(text)
		local instance = base_new(self, text)
		instance.raw_head = 1
		return instance
	end
end
-- Modified `advance` method which keeps track of raw_head.
-- Moves the head by `n` tokens (default 1; negative values move backwards; 0 is a no-op).
-- raw_head is moved by the recorded raw length of every token crossed; tokens with no entry in raw_lens count as 0.
-- Cleanup: the unused `text` locals were removed, and the three near-identical branches were merged into one forward loop and one backward loop.
function Parser:advance(n)
	local head, raw_head, raw_lens = self.head, self.raw_head, self.raw_lens
	n = n or 1
	if n > 0 then
		for _ = 1, n do
			raw_head = raw_head + (raw_lens[head] or 0)
			head = head + 1
		end
	else
		-- Moving backwards: step the head first, then subtract the length of the token now under it.
		for _ = 1, -n do
			head = head - 1
			raw_head = raw_head - (raw_lens[head] or 0)
		end
	end
	self.raw_head = raw_head
	self.head = head
end
-- Wraps the inherited `get` method so that raw_head is restored to its prior value whenever the route comes back as a bad route.
do
	local base_get = Parser.get

	function Parser:get(route, ...)
		local saved_raw_head = self.raw_head
		local layer = base_get(self, route, ...)
		if layer ~= self.n.bad_route then
			return layer
		end
		self.raw_head = saved_raw_head
		return layer
	end
end
-- Argument.
-- First value is the argument name.
-- Second value is the argument's default value.
-- Any additional values are ignored: "{{{a|b|c}}}" is argument "a" with default value "b" (*not* "b|c").
do
-- Token handler used while inside "{{{ ... }}}".
-- "|" closes the current value and opens the next; "}}}" closes the argument;
-- "}}" without a third brace, or the end of the text, fails the route.
-- Everything else is passed to the block handler.
local function handle_argument(self, this)
	if this == "|" then
		self:emit(Wikitext:new(self:pop_sublayer()))
		self:push_sublayer()
		return
	end
	if this == "" then
		return self:fail_route()
	end
	if this ~= "}" or self:read(1) ~= "}" then
		return self:block_handler(this)
	end
	if self:read(2) ~= "}" then
		return self:fail_route()
	end
	self:emit(Wikitext:new(self:pop_sublayer()))
	self:advance(2)
	return self:pop()
end
-- Route entry point for an argument: installs the argument handler on the current layer and opens a sublayer for the first value.
function Parser:do_argument()
	local layer = self.n
	rawset(layer, "handler", handle_argument)
	self:push_sublayer()
end
-- Attempts to parse an argument at the current position, falling back to a template if the route fails.
-- On success, three braces are consumed from the pending count and the Argument node is emitted.
function Parser:argument()
	local argument = self:get("do_argument")
	if argument == self.n.bad_route then
		self:template()
		return
	end
	-- If something has already been emitted at emit_pos, take it back and prepend it to the argument's first value.
	if self.n.len == self.n.emit_pos then
		local inner = self:remove()
		local first = argument[1]
		if type(first) == "table" then
			insert(first, 1, inner)
		else
			argument[1] = Wikitext:new{inner, first}
		end
	end
	self.n.braces = self.n.braces - 3
	self.n.brace_head = self.n.brace_head - 3
	argument.pos = self.n.brace_head
	self:emit(Argument:new(argument))
end
end
-- Template.
do
local handle_name
local handle_parameter

-- Token handler used while reading the template name.
-- "|" ends the name and switches to parameter handling; "}}" ends the template;
-- the end of the text fails the route. Everything else is passed to the block handler.
function handle_name(self, this)
	if this == "" then
		return self:fail_route()
	end
	if this == "|" then
		self:emit(Wikitext:new(self:pop_sublayer()))
		self.n.handler = handle_parameter
		self:push_sublayer()
		return
	end
	if this == "}" and self:read(1) == "}" then
		self:emit(Wikitext:new(self:pop_sublayer()))
		self:advance()
		return self:pop()
	end
	return self:block_handler(this)
end
-- Token handler used while reading a template parameter.
-- The first qualifying "=" splits the parameter into key and value; "|" closes the parameter and opens the next;
-- "}}" closes the template; the end of the text fails the route. Everything else is passed to the block handler.
function handle_parameter(self, this)
	if this == "|" then
		self:emit(Parameter:new(self:pop_sublayer()))
		self:push_sublayer()
		return
	elseif this == "" then
		return self:fail_route()
	elseif this == "}" and self:read(1) == "}" then
		self:emit(Parameter:new(self:pop_sublayer()))
		self:advance()
		return self:pop()
	elseif this == "=" and not self.n.key and (
		self:read(1) ~= "=" or
		self:read(-1) ~= "\n" and self:read(-1) ~= ""
	) then
		-- Everything collected so far becomes the key; a fresh sublayer collects the value.
		local key = self:pop_sublayer()
		self:push_sublayer()
		rawset(self.n, "key", Wikitext:new(key))
		return
	end
	return self:block_handler(this)
end
-- Route entry point for a template: installs the name handler on the current layer and opens a sublayer for the name.
function Parser:do_template()
	local layer = self.n
	rawset(layer, "handler", handle_name)
	self:push_sublayer()
end
-- Attempts to parse a template at the current position.
-- On failure, steps back one token and emits every pending opening brace as a literal "{" at emit_pos.
-- On success, two braces are consumed from the pending count and the Template node is emitted.
function Parser:template()
	local template = self:get("do_template")
	if template == self.n.bad_route then
		self:advance(-1)
		for _ = 1, self.n.braces do
			self:emit(self.n.emit_pos, "{")
		end
		self.n.braces = 0
		return
	end
	-- If something has already been emitted at emit_pos, take it back and prepend it to the template's name.
	if self.n.len == self.n.emit_pos then
		local inner = self:remove()
		local name = template[1]
		if type(name) == "table" then
			insert(name, 1, inner)
		else
			template[1] = Wikitext:new{inner, name}
		end
	end
	self.n.braces = self.n.braces - 2
	self.n.brace_head = self.n.brace_head - 2
	template.pos = self.n.brace_head
	self:emit(Template:new(template))
end
-- Handles a run of two or more opening braces, resolving it into templates and/or arguments.
-- Both callers in this module invoke it with the head on a "{" which is followed by another "{".
function Parser:template_or_argument()
-- Step past the first two braces, then count any further consecutive "{" tokens.
self:advance(2)
self.n.braces = 2
while self:read() == "{" do
self:advance()
self.n.braces = self.n.braces + 1
end
-- emit_pos is one past the current layer length; brace_head records raw_head just after the brace run.
-- NOTE(review): emit_pos is presumably the index at which the resulting node(s) get emitted - confirm against Module:parser.
self.n.emit_pos = self.n.len + 1
self.n.brace_head = self.raw_head
-- Consume the pending braces until none remain.
repeat
if self.n.braces == 1 then
-- A single leftover brace cannot open anything, so it is emitted as a literal.
self:emit(self.n.emit_pos, "{")
break
elseif self.n.braces == 2 then
self:template()
else
-- Three or more braces: try an argument first (Parser:argument falls back to a template on a bad route).
self:argument()
end
self:advance()
until self.n.braces == 0
-- Undo the last advance made inside the loop.
-- NOTE(review): presumably so that the caller's own advance lands on the next unread token - confirm.
self:advance(-1)
end
end
-- Text not in <onlyinclude></onlyinclude>.
-- Skips tokens (emitting nothing) until "<", "onlyinclude", ">" is found or the text ends, then advances two further tokens.
function Parser:not_onlyinclude()
	local this, nxt, nxt2 = self:read(), self:read(1), self:read(2)
	while this ~= "" and (this ~= "<" or nxt ~= "onlyinclude" or nxt2 ~= ">") do
		self:advance()
		-- Slide the three-token window along by one.
		this, nxt, nxt2 = nxt, nxt2, self:read(2)
	end
	self:advance(2)
end
-- Tag.
do
-- Returns true if a tag named `check` should be ignored:
-- "includeonly" when transcluded; "noinclude" and "onlyinclude" when not.
local function is_ignored_tag(self, check)
	if self.transcluded then
		return check == "includeonly"
	end
	return check == "noinclude" or check == "onlyinclude"
end
-- Handlers.
local handle_start
local handle_ignored_tag_start
local handle_ignored_tag
local handle_after_tag_name
local handle_before_attribute_name
local handle_attribute_name
local handle_before_attribute_value
local handle_quoted_attribute_value
local handle_unquoted_attribute_value
local handle_after_attribute_value
local handle_tag_block
local handle_end

-- Handles the first token after "<".
-- A closing tag ("/") is only accepted for ignored tag names. For opening tags, the lowercased name
-- selects the next handler; names which are neither inclusion tags nor listed in data.tags fail the route.
function handle_start(self, this)
	if this == "/" then
		local check = lower(self:read(1))
		if not is_ignored_tag(self, check) then
			return self:fail_route()
		end
		self.n.name = check
		self.n.ignored = true
		self:advance()
		self.n.handler = handle_ignored_tag_start
		return
	end
	local check = lower(this)
	if is_ignored_tag(self, check) then
		self.n.name = check
		self.n.ignored = true
		self.n.handler = handle_ignored_tag_start
		return
	end
	if (
		check == "noinclude" and self.transcluded or
		check == "includeonly" and not self.transcluded
	) then
		-- Marked as ignored, but handled like an ordinary tag from here on.
		self.n.name = check
		self.n.ignored = true
	elseif data.tags[check] then
		self.n.name = check
	else
		return self:fail_route()
	end
	self.n.handler = handle_after_tag_name
end
-- Handles the token directly after the name of an ignored tag.
-- ">" or "/>" ends the tag; a spacing character hands over to handle_ignored_tag; anything else fails the route.
function handle_ignored_tag_start(self, this)
	if this == ">" then
		return self:pop()
	end
	if this == "/" and self:read(1) == ">" then
		self.n.self_closing = true
		self:advance()
		return self:pop()
	end
	if not is_space(this) then
		return self:fail_route()
	end
	self.n.handler = handle_ignored_tag
end
-- Skips over the remainder of an ignored tag: ">" ends it, the end of the text fails the route, and all other tokens are discarded.
function handle_ignored_tag(self, this)
	if this == "" then
		return self:fail_route()
	end
	if this == ">" then
		return self:pop()
	end
end
-- Handles the token directly after a tag name.
-- ">" starts the tag's contents; a spacing character starts attribute parsing; "/>" ends a self-closing tag; anything else fails the route.
function handle_after_tag_name(self, this)
	if this == ">" then
		self.n.handler = handle_tag_block
		return
	end
	if is_space(this) then
		self.n.handler = handle_before_attribute_name
		return
	end
	if this == "/" and self:read(1) == ">" then
		self.n.self_closing = true
		self:advance()
		return self:pop()
	end
	return self:fail_route()
end
-- Handles tokens between attributes, before an attribute name starts.
-- * End of the text: the route fails.
-- * ">": the tag's contents begin.
-- * "/>": the tag is self-closing and ends here.
-- * A stray "/" or a spacing character: skipped.
-- * Anything else: starts an attribute name, and the token is reprocessed by handle_attribute_name.
-- Fix: the end-of-text check used to come after the "start of attribute name" check, which "" also satisfies, so it could never be reached. The route still failed, but only indirectly after a sublayer had been pushed. It is now checked first.
function handle_before_attribute_name(self, this)
	if this == "" then
		return self:fail_route()
	elseif this == ">" then
		self.n.handler = handle_tag_block
	elseif this == "/" then
		if self:read(1) == ">" then
			self.n.self_closing = true
			self:advance()
			return self:pop()
		end
	elseif not is_space(this) then
		self:push_sublayer(handle_attribute_name)
		return self:consume()
	end
end
-- Collects the tokens of an attribute name.
-- "=" ends the name and stores it (lowercased) on the tag layer; "/", ">" or a spacing character abandons the name
-- and reprocesses the token; the end of the text fails the route.
function handle_attribute_name(self, this)
	if this == "=" then
		-- The sublayer must be popped into a local first: writing `self.n.attr_name = ulower(concat(self:pop_sublayer()))` would make Lua take self.n to be the layer being popped.
		local attr_name = ulower(concat(self:pop_sublayer()))
		self.n.attr_name = attr_name
		self.n.handler = handle_before_attribute_value
		return
	end
	if this == "" then
		return self:fail_route()
	end
	if this ~= "/" and this ~= ">" and not is_space(this) then
		self:emit(this)
		return
	end
	self:pop_sublayer()
	return self:consume()
end
-- Handles the token directly after an attribute's "=".
-- A quote opens a quoted value; "/", ">" or a spacing character means the value is empty;
-- the end of the text fails the route; anything else starts an unquoted value.
function handle_before_attribute_value(self, this)
	if this == "" then
		return self:fail_route()
	end
	if this == "\"" or this == "'" then
		self:push_sublayer(handle_quoted_attribute_value)
		rawset(self.n, "quoter", this)
		return
	end
	if is_space(this) then
		handle_after_attribute_value(self, "")
		return
	end
	if this == "/" or this == ">" then
		handle_after_attribute_value(self, "")
	else
		self:push_sublayer(handle_unquoted_attribute_value)
	end
	-- In both remaining cases the current token is reprocessed by the newly selected handler.
	return self:consume()
end
-- Collects the tokens of a quoted attribute value.
-- The matching quote ends the value; ">" also ends it, and is then reprocessed; the end of the text fails the route.
function handle_quoted_attribute_value(self, this)
	if this == "" then
		return self:fail_route()
	end
	local closes_tag = this == ">"
	if not closes_tag and this ~= self.n.quoter then
		self:emit(this)
		return
	end
	handle_after_attribute_value(self, concat(self:pop_sublayer()))
	if closes_tag then
		return self:consume()
	end
end
-- Collects the tokens of an unquoted attribute value.
-- A spacing character ends the value; "/" or ">" also ends it, and is then reprocessed; the end of the text fails the route.
function handle_unquoted_attribute_value(self, this)
	if this == "" then
		return self:fail_route()
	end
	local ends_tag = this == "/" or this == ">"
	if not ends_tag and not is_space(this) then
		self:emit(this)
		return
	end
	handle_after_attribute_value(self, concat(self:pop_sublayer()))
	if ends_tag then
		return self:consume()
	end
end
-- Stores `attr_value` under the pending attribute name (creating the attributes table if needed),
-- clears the pending name and returns to looking for the next attribute.
function handle_after_attribute_value(self, attr_value)
	local attributes = self.n.attributes or {}
	self.n.attributes = attributes
	attributes[self.n.attr_name] = attr_value
	self.n.attr_name = nil
	self.n.handler = handle_before_attribute_name
end
-- Collects the contents of a tag as raw tokens.
-- On "</name" (name compared in lowercase) a closing-tag route is attempted: if it succeeds the tag ends,
-- otherwise the "<" is emitted as ordinary text. The end of the text fails the route.
function handle_tag_block(self, this)
	if this == "" then
		return self:fail_route()
	end
	local is_closing = (
		this == "<" and
		self:read(1) == "/" and
		lower(self:read(2)) == self.n.name
	)
	if not is_closing then
		self:emit(this)
		return
	end
	local tag_end = self:get("do_tag_end")
	if tag_end == self.n.bad_route then
		self:emit("<")
		return
	end
	return self:pop()
end
-- Handles what follows the name in a closing tag: ">" ends it, spacing characters are skipped, and anything else fails the route.
function handle_end(self, this)
	if this == ">" then
		return self:pop()
	end
	if is_space(this) then
		return
	end
	return self:fail_route()
end
-- Route entry point for a tag: installs the start handler on the current layer and steps past the "<".
function Parser:do_tag()
	local layer = self.n
	rawset(layer, "handler", handle_start)
	self:advance()
end
-- Route entry point for a closing tag: installs the end handler on the current layer and steps past "<", "/" and the tag name.
function Parser:do_tag_end()
	local layer = self.n
	rawset(layer, "handler", handle_end)
	self:advance(3)
end
-- Attempts to parse a tag at the current "<". Emits a Tag node on success, or the literal "<" if the route fails.
function Parser:tag()
	local tag = self:get("do_tag")
	if tag ~= self.n.bad_route then
		self:emit(Tag:new(tag))
		return
	end
	self:emit("<")
end
end
-- Block handlers.
-- These are blocks which can affect template/argument parsing, since they're also parsed by Parsoid at the same time (even though they aren't processed until later).
-- All blocks (including templates/arguments) can nest inside each other, but an inner block must be closed before the outer block which contains it. This is why, for example, the wikitext "{{template| [[ }}" will result in an unprocessed template, since the inner "[[" is treated as the opening of a wikilink block, which prevents "}}" from being treated as the closure of the template block. On the other hand, "{{template| [[ ]] }}" will process correctly, since the wikilink block is closed before the template closure. It makes no difference whether the block will be treated as valid or not when it's processed later on, so "{{template| [[ }} ]] }}" would also work, even though "[[ }} ]]" is not a valid wikilink.
-- Note that nesting also affects pipes and equals signs, in addition to block closures.
-- These blocks can be nested to any degree, so "{{template| [[ [[ [[ ]] }}" will not work, since only one of the three wikilink blocks has been closed. On the other hand, "{{template| [[ [[ [[ ]] ]] ]] }}" will work.
-- All blocks are implicitly closed by the end of the text, since their validity is irrelevant at this stage.
do
-- Headings
-- Opens with "\n=" (or "=" at the start of the text), and closes with "\n". Note that it doesn't matter whether the heading will fail to process due to a premature newline (e.g. if there are no closing signs), so at this stage the only thing that matters for closure is the newline.
-- Note: if directly inside a template block, a newline followed by a single equals sign is parsed as a parameter equals sign, not the opening of a new L1 heading block. This does not apply to any other heading levels. As such, "\n=}}" will successfully close a template, but "\n==}}" will not, since in the latter case the "}}" would fall inside the new heading block.
-- Handler for the inside of a heading block: a newline is emitted and closes the block; everything else is passed to the block handler.
local function handle_heading_block(self, this)
	if this ~= "\n" then
		return self:block_handler(this)
	end
	self:emit("\n")
	return self:pop()
end
-- Language conversion block.
-- Opens with "-{" and closes with "}-". However, templates/arguments take priority, so "-{{" is parsed as "-" followed by the opening of a template/argument block (depending on what comes after).
-- Note: Language conversion blocks aren't actually enabled on the English Wiktionary, but Parsoid still parses them at this stage.
-- Handler for the inside of a language conversion block: "}-" is emitted and closes the block; everything else is passed to the block handler.
local function handle_language_conversion_block(self, this)
	if this ~= "}" or self:read(1) ~= "-" then
		return self:block_handler(this)
	end
	self:advance()
	self:emit("}")
	self:emit("-")
	return self:pop()
end
-- Wikilink block.
-- Opens with "[[" and closes with "]]".
-- Handler for the inside of a wikilink block: "]]" is emitted and closes the block; everything else is passed to the block handler.
local function handle_wikilink_block(self, this)
	if this ~= "]" or self:read(1) ~= "]" then
		return self:block_handler(this)
	end
	self:advance()
	self:emit("]")
	self:emit("]")
	return self:pop()
end
-- Route entry point for a block: installs the given handler on the current layer.
function Parser:do_block(handler)
	local layer = self.n
	rawset(layer, "handler", handler)
end
-- Detects the opening of a nested block ("-{", a heading "=" at the start of a line or of the text, or "[[")
-- and parses it with the matching block handler, emitting the resulting tokens.
-- "-{{" is treated as "-" followed by a template/argument. Anything else goes to the main handler.
function Parser:block_handler(this)
	if this == "-" then
		if self:read(1) == "{" then
			self:advance()
			self:emit("-")
			if self:read(1) == "{" then
				self:template_or_argument()
			else
				self:emit_tokens(self:get("do_block", handle_language_conversion_block))
			end
			return
		end
	elseif this == "=" then
		local previous = self:read(-1)
		if previous == "\n" or previous == "" then
			self:advance()
			self:emit("=")
			self:emit_tokens(self:get("do_block", handle_heading_block))
			return
		end
	elseif this == "[" then
		if self:read(1) == "[" then
			self:advance()
			self:emit("[")
			self:emit_tokens(self:get("do_block", handle_wikilink_block))
			return
		end
	end
	return self:main_handler(this)
end
end
-- Top-level token handler.
-- The end of the text pops the layer; "{{" starts a template/argument; "<" starts an HTML comment (skipped),
-- a closing onlyinclude tag (when onlyinclude is in effect) or a tag; everything else is emitted as-is.
function Parser:main_handler(this)
	if this == "" then
		return self:pop()
	elseif this == "{" then
		if self:read(1) == "{" then
			self:template_or_argument()
		else
			self:emit(this)
		end
		return
	elseif this ~= "<" then
		self:emit(this)
		return
	end
	if (
		self:read(1) == "!" and
		self:read(2) == "-" and
		self:read(3) == "-"
	) then
		-- HTML comment: skip tokens until "-->" or the end of the text.
		self:advance(4)
		local current, nxt, nxt2 = self:read(), self:read(1), self:read(2)
		while current ~= "" and (current ~= "-" or nxt ~= "-" or nxt2 ~= ">") do
			self:advance()
			current, nxt, nxt2 = nxt, nxt2, self:read(2)
		end
		self:advance(2)
	elseif (
		self.onlyinclude and
		self:read(1) == "/" and
		self:read(2) == "onlyinclude" and
		self:read(3) == ">"
	) then
		-- Closing onlyinclude tag: skip everything up to the next opening one.
		self:advance(4)
		self:not_onlyinclude()
	else
		self:tag()
	end
end
do
-- If `transcluded` is true, then the text is checked for a pair of onlyinclude tags. If these are found (even if they're in the wrong order), then the start of the page is treated as though it is preceded by a closing onlyinclude tag.
-- Note that onlyinclude tags *can* be implicitly closed by the end of the text, but the hard requirement above means this can only happen if either the tags are in the wrong order or there are multiple onlyinclude blocks.
-- Route entry point for a whole parse.
-- Stores raw_lens and title on the parser. If `transcluded` is set and the text contains both "<onlyinclude>" and "</onlyinclude>",
-- everything before the first opening onlyinclude tag is skipped. Finally, the main handler is installed on the current layer.
function Parser:do_parse(raw_lens, str, transcluded, title)
	self.raw_lens = raw_lens
	self.title = title
	if transcluded then
		self.transcluded = true
		if match(str, "<onlyinclude>") then
			if match(str, "</onlyinclude>") then
				self.onlyinclude = true
				self:not_onlyinclude()
				self:advance()
			end
		end
	end
	local layer = self.n
	rawset(layer, "handler", self.main_handler)
end
-- Tokenises `str` and parses it, returning the second value returned by Parser:parse.
-- The text is split so that every spacing or syntax character is its own token, with the runs between them kept whole.
-- raw_lens records the byte length of each token, which the parser uses to keep track of raw_head.
function export.parse(str, transcluded, title)
	local text, raw_lens = {}, {}
	local n, start = 0, 1
	for loc, char in gmatch(str, "()([%s!\"'%-/<=>%[%]{|}])") do
		if start < loc then
			-- Plain run between the previous special character and this one.
			local chunk = sub(str, start, loc - 1)
			n = n + 1
			text[n], raw_lens[n] = chunk, #chunk
		end
		n = n + 1
		text[n], raw_lens[n] = char, #char
		start = loc + 1
	end
	if start <= #str then
		-- Trailing run after the last special character.
		local rest = sub(str, start)
		n = n + 1
		text[n], raw_lens[n] = rest, #rest
	end
	local _, result = Parser:parse{
		text = text,
		node = Wikitext,
		route = "do_parse",
		raw_lens,
		str,
		transcluded,
		title
	}
	return result
end
end
do
-- Normalize the template name:
-- (1) Convert to string.
-- (2) Preprocess.
-- (3) Resolve any HTML entities.
-- (4) Convert all characters which the parser treats as spaces in links to conventional spaces.
-- (5) Remove consecutive spaces.
-- (6) Trim.
-- Returns the normalized form of a template name: stringified, preprocessed, with entities resolved,
-- link-space characters mapped via data.name_spaces, runs of spaces collapsed, and trimmed.
local function normalize_name(name)
	local resolved = get_entities(frame:preprocess(tostring(name)))
	local spaced = gsub(
		resolved,
		"[_\194\225-\227][\128\129\154\160]?[\128-\138\142\159\168\169\175]?",
		data.name_spaces
	)
	name = gsub(spaced, "%s+", " ")
	-- Sub is fastest, since runs of spaces have been collapsed and so there is one character max at each end.
	local first = is_space(sub(name, 1, 1)) and 2 or 1
	local last = #name
	if is_space(sub(name, last)) then
		last = last - 1
	end
	return sub(name, first, last)
end
-- Parses `text` (as transcluded, unless `not_transcluded` is set). If the result is a template node,
-- returns its normalized name and its parameter table; otherwise returns nothing.
function export.parseTemplate(text, not_transcluded)
	local node = export.parse(text, not not_transcluded)
	if not node or node.type ~= "template" then
		return
	end
	return normalize_name(node[1]), node:get_params()
end
-- Parses `text` (as transcluded, unless `not_transcluded` is set) and returns an iterator over the templates found.
-- Each call yields the normalized name, the parameter table, the template's wikitext and its `pos` value;
-- it returns nothing once no nodes remain.
function export.findTemplates(text, not_transcluded)
	local tree = export.parse(text, not not_transcluded)
	local iterate, node = tree:__pairs("next_node")
	return function()
		node = iterate()
		-- Skip every node which isn't a template.
		while node and node.type ~= "template" do
			node = iterate()
		end
		if node then
			return normalize_name(node[1]), node:get_params(), tostring(node), node.pos
		end
	end
end
end
return export