Modul:template parser

Vikilug‘atdan olingan

Bu modul uchun Modul:template parser/doc nomli hujjat sahifasini yaratishingiz mumkin

local concat = table.concat
local get_entities = require("Module:utilities").get_entities
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local lower = string.lower
local match = string.match
local rawset = rawset
local select = select
local sub = string.sub
local tonumber = tonumber
local tostring = tostring
local type = type
local ulower = string.ulower

local m_parser = require("Module:parser")
local data = mw.loadData("Module:template parser/data")
local frame = mw.getCurrentFrame()

local export = {}

------------------------------------------------------------------------------------
--
-- Helper functions
--
------------------------------------------------------------------------------------

-- Returns true if `this` is exactly one of the six ASCII whitespace characters
-- (space, tab, newline, vertical tab, form feed, carriage return), else false.
local function is_space(this)
	if this == " " or this == "\n" or this == "\t" or this == "\r" then
		return true
	end
	return this == "\v" or this == "\f"
end

-- Strips leading and trailing ASCII whitespace (as defined by is_space) from `str`.
-- Returns "" if the string is empty or consists only of whitespace.
-- Note: scanning with sub is much faster here than the equivalent string patterns.
local function trim(str)
	local len = #str
	-- Find the first non-space character.
	local first = 1
	while first <= len and is_space(sub(str, first, first)) do
		first = first + 1
	end
	if first > len then
		return ""
	end
	-- The character at `first` is not a space, so this scan always stops at or after it.
	local last = len
	while is_space(sub(str, last, last)) do
		last = last - 1
	end
	return sub(str, first, last)
end

------------------------------------------------------------------------------------
--
-- Nodes
--
------------------------------------------------------------------------------------

local Node = m_parser.Node
local Wikitext = m_parser.Wikitext

local Tag = Node:new("tag")

-- Serialises the tag back to text.
-- Ignored tags produce "". Otherwise the opening tag is rebuilt from the name and
-- any attributes (always double-quoted, in pairs() order), followed either by "/>"
-- for self-closing tags or by ">", the concatenated contents and the closing tag.
function Tag:__tostring()
	if self.ignored then
		return ""
	end
	local parts = {"<", self.name}
	local attributes = self.attributes
	if attributes then
		for attr, value in pairs(attributes) do
			parts[#parts + 1] = " " .. attr .. "=\"" .. value .. "\""
		end
	end
	if self.self_closing then
		return concat(parts) .. "/>"
	end
	return concat(parts) .. ">" .. concat(self) .. "</" .. self.name .. ">"
end

local Argument = Node:new("argument")

-- Serialises the argument back to wikitext: "{{{" followed by every entry joined
-- with "|" and a closing "}}}". An argument with no entries yields the
-- placeholder string "argument".
function Argument:__tostring()
	if not self[2] then
		if self[1] then
			return "{{{" .. tostring(self[1]) .. "}}}"
		end
		return "argument"
	end
	local pieces, i = {tostring(self[1])}, 2
	while self[i] do
		pieces[i] = tostring(self[i])
		i = i + 1
	end
	return "{{{" .. concat(pieces, "|") .. "}}}"
end

-- Iteration step for an argument: yields entries 1 and 2 only (the name and the
-- default value), returning the entry, the node itself and the new index.
-- Returns nothing once the index passes 2.
function Argument:next(i)
	local nxt = i + 1
	if nxt > 2 then
		return
	end
	return self[nxt], self, nxt
end

local Parameter = Node:new("parameter")

-- Serialises the parameter: "key=value" when the parameter has an explicit key,
-- otherwise just the value as produced by Node.__tostring.
function Parameter:__tostring()
	local key = self.key
	if not key then
		return Node.__tostring(self)
	end
	return tostring(key) .. "=" .. Node.__tostring(self)
end

local Template = Node:new("template")

-- Serialises the template back to wikitext.
-- self[1] is the template name and each later entry is one argument, joined with "|".
-- A template with no entries yields the placeholder string "template".
-- If the `colon` field is set, the first argument (self[2]) is attached to the name
-- with ":" instead of "|", and pipe-separated arguments resume at self[3].
-- NOTE(review): `colon` is presumably for parser-function syntax such as
-- "{{#if:x|y}}"; nothing in this file sets it, so confirm against callers.
function Template:__tostring()
	if self[2] then
		local output, n = {"{{", tostring(self[1])}, 2
		if self.colon then
			insert(output, ":")
			-- The colon-separated value is self[2]. Emitting self[3] here would drop
			-- self[2] and print self[3] twice, since the loop below resumes at 3.
			insert(output, tostring(self[2]))
			n = 3
		end
		for i = n, #self do
			insert(output, "|")
			insert(output, tostring(self[i]))
		end
		insert(output, "}}")
		return concat(output)
	elseif self[1] then
		return "{{" .. tostring(self[1]) .. "}}"
	else
		return "template"
	end
end

-- Builds a table of the template's parameters, keyed by name or position.
-- Implicit (unnamed) parameters are numbered from 1 in order of appearance and keep
-- their value untrimmed; explicit keys and their values are both trimmed.
-- Explicit parameter keys are converted to numbers if:
-- (a) They are integers, with no decimals (2.0) or leading zeroes (02).
-- (b) They are <= 2^53 and >= -2^53.
-- Note: Lua integers are only accurate to 2^53 - 1, so 2^53 and -2^53 have to be specifically checked for since Lua will evaluate 2^53 as equal to 2^53 + 1.
function Template:get_params()
	local params, positional = {}, 0
	for i = 2, self.len do
		local param = self[i]
		local raw_key = param.key
		if raw_key then
			local key = trim(tostring(raw_key))
			if key == "0" or match(key, "^-?[1-9]%d*$") then
				local num = tonumber(key)
				local in_safe_range = num >= -9007199254740991 and num <= 9007199254740991
				if in_safe_range or key == "9007199254740992" or key == "-9007199254740992" then
					key = num
				end
			end
			params[key] = trim(Node.__tostring(param))
		else
			positional = positional + 1
			params[positional] = tostring(param)
		end
	end
	return params
end

------------------------------------------------------------------------------------
--
-- Parser
--
------------------------------------------------------------------------------------

local Parser = m_parser.Parser

-- Wraps the inherited constructor so that every new parser also starts with
-- raw_head set to 1.
do
	local base_new = Parser.new
	function Parser:new(text)
		local instance = base_new(self, text)
		instance.raw_head = 1
		return instance
	end
end

-- Modified `advance` method which keeps track of raw_head.
-- Moves the head by `n` tokens (default 1; negative values move backwards).
-- raw_head is adjusted by the raw length of every token stepped over, using
-- self.raw_lens (a missing entry counts as length 0).
function Parser:advance(n)
	local head, raw_head, raw_lens = self.head, self.raw_head, self.raw_lens
	n = n or 1
	if n > 0 then
		for _ = 1, n do
			raw_head = raw_head + (raw_lens[head] or 0)
			head = head + 1
		end
	else
		-- Moving backwards: step the head first, then subtract that token's length.
		for _ = 1, -n do
			head = head - 1
			raw_head = raw_head - (raw_lens[head] or 0)
		end
	end
	self.head = head
	self.raw_head = raw_head
end

-- Wraps the inherited `get` so that raw_head is restored to its value from before
-- the call whenever the route comes back as a bad route.
do
	local base_get = Parser.get
	function Parser:get(route, ...)
		local saved_raw_head = self.raw_head
		local layer = base_get(self, route, ...)
		if layer ~= self.n.bad_route then
			return layer
		end
		self.raw_head = saved_raw_head
		return layer
	end
end

-- Argument.
-- First value is the argument name.
-- Second value is the argument's default value.
-- Any additional values are ignored: "{{{a|b|c}}}" is argument "a" with default value "b" (*not* "b|c").
do
	-- Handler used while inside "{{{ ... }}}".
	-- "|" closes the current sublayer as a Wikitext value and opens a new one;
	-- "}}}" closes the last value and pops the layer; "}}" not followed by a third
	-- "}" or the end of the text fails the route; anything else goes to block_handler.
	local function handle_argument(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "|" then
			self:emit(Wikitext:new(self:pop_sublayer()))
			self:push_sublayer()
			return
		end
		if this ~= "}" or self:read(1) ~= "}" then
			return self:block_handler(this)
		end
		if self:read(2) ~= "}" then
			return self:fail_route()
		end
		self:emit(Wikitext:new(self:pop_sublayer()))
		self:advance(2)
		return self:pop()
	end
	
	-- Route entry point for an argument: installs handle_argument as the current
	-- layer's handler, then opens the first sublayer.
	function Parser:do_argument()
		local layer = self.n
		rawset(layer, "handler", handle_argument)
		self:push_sublayer()
	end

	-- Tries to parse an argument ("{{{...}}}") at the current position.
	-- Runs the "do_argument" route; if it comes back as a bad route, the braces are
	-- retried as a template via self:template(). On success, three braces are
	-- deducted from the pending brace count and an Argument node is emitted.
	function Parser:argument()
		local argument = self:get("do_argument")
		if argument == self.n.bad_route then
			self:template()
		else
			-- emit_pos is set to len + 1 by template_or_argument, so len == emit_pos
			-- means exactly one item has been emitted since the braces were opened.
			-- That item is taken back with self:remove() and prepended to the
			-- argument's first value (NOTE(review): presumably the result of an
			-- inner template/argument parsed on a previous loop iteration - confirm).
			if self.n.len == self.n.emit_pos then
				local inner = self:remove()
				if type(argument[1]) == "table" then
					insert(argument[1], 1, inner)
				else
					argument[1] = Wikitext:new{inner, argument[1]}
				end
			end
			-- Three braces have been used up; brace_head moves back by the same
			-- amount and is stored on the node as `pos`.
			self.n.braces = self.n.braces - 3
			self.n.brace_head = self.n.brace_head - 3
			argument.pos = self.n.brace_head
			self:emit(Argument:new(argument))
		end
	end
end

-- Template.
do
	local handle_name
	local handle_parameter
	
	-- Handler used while reading a template's name.
	-- "|" closes the name as a Wikitext value and switches to handle_parameter;
	-- "}}" closes the name and pops the layer; the end of the text fails the route;
	-- anything else goes to block_handler.
	function handle_name(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "|" then
			self:emit(Wikitext:new(self:pop_sublayer()))
			self.n.handler = handle_parameter
			self:push_sublayer()
			return
		end
		if this == "}" and self:read(1) == "}" then
			self:emit(Wikitext:new(self:pop_sublayer()))
			self:advance()
			return self:pop()
		end
		return self:block_handler(this)
	end
	
	-- Handler used while reading a template parameter.
	-- The first "=" in a parameter splits key from value, unless it is followed by
	-- another "=" while sitting at the start of a line or of the text (that case is
	-- left to block_handler). "|" closes the parameter and starts the next one,
	-- "}}" closes it and pops the layer, and the end of the text fails the route.
	function handle_parameter(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "|" then
			self:emit(Parameter:new(self:pop_sublayer()))
			self:push_sublayer()
			return
		end
		if this == "}" and self:read(1) == "}" then
			self:emit(Parameter:new(self:pop_sublayer()))
			self:advance()
			return self:pop()
		end
		if this == "=" and not self.n.key then
			local is_key_separator = self:read(1) ~= "="
			if not is_key_separator then
				local previous = self:read(-1)
				is_key_separator = previous ~= "\n" and previous ~= ""
			end
			if is_key_separator then
				-- The key must be popped before the new sublayer is pushed, and
				-- stored only afterwards, so that it lands on the new sublayer.
				local key = self:pop_sublayer()
				self:push_sublayer()
				rawset(self.n, "key", Wikitext:new(key))
				return
			end
		end
		return self:block_handler(this)
	end
	
	-- Route entry point for a template: installs handle_name as the current layer's
	-- handler, then opens the sublayer which will collect the template name.
	function Parser:do_template()
		local layer = self.n
		rawset(layer, "handler", handle_name)
		self:push_sublayer()
	end
	
	-- Tries to parse a template ("{{...}}") at the current position.
	-- Runs the "do_template" route. On failure, all pending opening braces are
	-- emitted as literal "{" text. On success, two braces are deducted from the
	-- pending brace count and a Template node is emitted.
	function Parser:template()
		local template = self:get("do_template")
		if template == self.n.bad_route then
			-- Step back one token, then insert one literal "{" at emit_pos for
			-- every brace still pending and clear the count.
			self:advance(-1)
			for _ = 1, self.n.braces do
				self:emit(self.n.emit_pos, "{")
			end
			self.n.braces = 0
		else
			-- emit_pos is set to len + 1 by template_or_argument, so len == emit_pos
			-- means exactly one item has been emitted since the braces were opened.
			-- That item is taken back with self:remove() and prepended to the
			-- template's first value (NOTE(review): presumably the result of an
			-- inner template/argument parsed on a previous loop iteration - confirm).
			if self.n.len == self.n.emit_pos then
				local inner = self:remove()
				if type(template[1]) == "table" then
					insert(template[1], 1, inner)
				else
					template[1] = Wikitext:new{inner, template[1]}
				end
			end
			-- Two braces have been used up; brace_head moves back by the same
			-- amount and is stored on the node as `pos`.
			self.n.braces = self.n.braces - 2
			self.n.brace_head = self.n.brace_head - 2
			template.pos = self.n.brace_head
			self:emit(Template:new(template))
		end
	end
	
	-- Entry point for a run of two or more opening braces.
	-- Counts the braces, then repeatedly tries to consume them: three or more are
	-- tried as an argument (which itself falls back to a template), exactly two as a
	-- template, and a single leftover brace is emitted as literal text.
	function Parser:template_or_argument()
		-- Skip the first two braces, then count any further ones.
		self:advance(2)
		self.n.braces = 2
		while self:read() == "{" do
			self:advance()
			self.n.braces = self.n.braces + 1
		end
		-- Remember where output for this brace run starts, and the raw position
		-- reached once every opening brace has been read.
		self.n.emit_pos = self.n.len + 1
		self.n.brace_head = self.raw_head
		repeat
			if self.n.braces == 1 then
				-- A lone brace cannot open anything: insert it as text at emit_pos.
				self:emit(self.n.emit_pos, "{")
				break
			elseif self.n.braces == 2 then
				self:template()
			else
				self:argument()
			end
			self:advance()
		until self.n.braces == 0
		-- Undo one step of the advance made inside the loop (or before the break).
		self:advance(-1)
	end
end

-- Skips text which is not inside <onlyinclude></onlyinclude>.
-- Advances token by token until the current three tokens are "<", "onlyinclude",
-- ">" or the end of the text is reached, then advances two more tokens.
function Parser:not_onlyinclude()
	local current, ahead, ahead2 = self:read(), self:read(1), self:read(2)
	while current ~= "" and not (
		current == "<" and ahead == "onlyinclude" and ahead2 == ">"
	) do
		self:advance()
		-- Slide the three-token window along by one.
		current, ahead, ahead2 = ahead, ahead2, self:read(2)
	end
	self:advance(2)
end

-- Tag.
do
	-- Returns true if the tag name `check` names an inclusion tag which is ignored
	-- in the current mode: "includeonly" when transcluded, "noinclude" or
	-- "onlyinclude" when not transcluded.
	local function is_ignored_tag(self, check)
		if self.transcluded then
			return check == "includeonly"
		end
		return check == "noinclude" or check == "onlyinclude"
	end
	
	-- Handlers.
	local handle_start
	local handle_ignored_tag_start
	local handle_ignored_tag
	local handle_after_tag_name
	local handle_before_attribute_name
	local handle_attribute_name
	local handle_before_attribute_value
	local handle_quoted_attribute_value
	local handle_unquoted_attribute_value
	local handle_after_attribute_value
	local handle_tag_block
	local handle_end
	
	-- First handler of a tag route, called with the token after "<".
	-- A closing tag ("/") is only accepted for ignored inclusion tags. An opening
	-- tag is accepted if it is an ignored inclusion tag, the other kind of
	-- inclusion tag for the current mode, or a name listed in data.tags. Anything
	-- else fails the route.
	function handle_start(self, this)
		if this == "/" then
			local name = lower(self:read(1))
			if not is_ignored_tag(self, name) then
				return self:fail_route()
			end
			self.n.name = name
			self.n.ignored = true
			self:advance()
			self.n.handler = handle_ignored_tag_start
			return
		end
		local name = lower(this)
		local ignored, next_handler
		if is_ignored_tag(self, name) then
			ignored, next_handler = true, handle_ignored_tag_start
		elseif (
			name == "noinclude" and self.transcluded or
			name == "includeonly" and not self.transcluded
		) then
			-- The tag itself is dropped from the output, but it is parsed like any other tag.
			ignored, next_handler = true, handle_after_tag_name
		elseif data.tags[name] then
			next_handler = handle_after_tag_name
		else
			return self:fail_route()
		end
		self.n.name = name
		if ignored then
			self.n.ignored = true
		end
		self.n.handler = next_handler
	end
	
	-- Handler for the token straight after an ignored tag's name.
	-- ">" or "/>" ends the tag; whitespace moves on to handle_ignored_tag; anything
	-- else fails the route.
	function handle_ignored_tag_start(self, this)
		if this == ">" then
			return self:pop()
		end
		if this == "/" and self:read(1) == ">" then
			self.n.self_closing = true
			self:advance()
			return self:pop()
		end
		if not is_space(this) then
			return self:fail_route()
		end
		self.n.handler = handle_ignored_tag
	end
	
	-- Handler for the remainder of an ignored tag: every token is discarded until
	-- ">" ends the tag. Reaching the end of the text fails the route.
	function handle_ignored_tag(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == ">" then
			return self:pop()
		end
	end
	
	-- Handler for the token straight after a tag's name.
	-- ">" starts the tag's contents, "/>" ends a self-closing tag, whitespace moves
	-- on to the attributes, and anything else fails the route.
	function handle_after_tag_name(self, this)
		if this == ">" then
			self.n.handler = handle_tag_block
			return
		end
		if this == "/" and self:read(1) == ">" then
			self.n.self_closing = true
			self:advance()
			return self:pop()
		end
		if not is_space(this) then
			return self:fail_route()
		end
		self.n.handler = handle_before_attribute_name
	end
	
	-- Handler for the gap before an attribute name inside a tag.
	-- "/>" ends a self-closing tag and ">" starts the tag's contents. The end of the
	-- text fails the route. Any other token which is not "/" or whitespace starts an
	-- attribute name: a sublayer is pushed with handle_attribute_name and the same
	-- token is consumed again by it. A stray "/" and whitespace are skipped.
	function handle_before_attribute_name(self, this)
		if this == "/" and self:read(1) == ">" then
			self.n.self_closing = true
			self:advance()
			return self:pop()
		elseif this == ">" then
			self.n.handler = handle_tag_block
		elseif this == "" then
			-- This must be tested before the attribute-name branch: "" is neither
			-- "/" nor whitespace, so it would otherwise be taken as the start of a name.
			return self:fail_route()
		elseif this ~= "/" and not is_space(this) then
			self:push_sublayer(handle_attribute_name)
			return self:consume()
		end
	end
	
	-- Sublayer handler which collects an attribute name.
	-- "=" finishes the name (stored lowercased as attr_name) and moves on to the
	-- value. "/", ">" or whitespace discard the collected name and hand the same
	-- token back to the outer handler. The end of the text fails the route.
	function handle_attribute_name(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "=" then
			-- Can't do `self.n.attr_name = ulower(concat(self:pop_sublayer()))` or Lua will take self.n to be the layer being popped.
			local attr_name = ulower(concat(self:pop_sublayer()))
			self.n.attr_name = attr_name
			self.n.handler = handle_before_attribute_value
			return
		end
		if this == "/" or this == ">" or is_space(this) then
			self:pop_sublayer()
			return self:consume()
		end
		self:emit(this)
	end
	
	-- Handler for the token straight after an attribute's "=".
	-- A quote character opens a quoted value (the quote is stored as `quoter` on the
	-- new sublayer). "/", ">" or whitespace mean the value is empty; for "/" and ">"
	-- the token is then consumed again. Any other token starts an unquoted value.
	-- The end of the text fails the route.
	function handle_before_attribute_value(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "\"" or this == "'" then
			self:push_sublayer(handle_quoted_attribute_value)
			rawset(self.n, "quoter", this)
			return
		end
		if this == "/" or this == ">" then
			handle_after_attribute_value(self, "")
			return self:consume()
		end
		if is_space(this) then
			handle_after_attribute_value(self, "")
			return
		end
		self:push_sublayer(handle_unquoted_attribute_value)
		return self:consume()
	end
	
	-- Sublayer handler which collects a quoted attribute value.
	-- The value ends at the matching quote character, or at ">" (in which case ">"
	-- is consumed again by the outer handler). The end of the text fails the route.
	function handle_quoted_attribute_value(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == ">" then
			handle_after_attribute_value(self, concat(self:pop_sublayer()))
			return self:consume()
		end
		if this ~= self.n.quoter then
			self:emit(this)
			return
		end
		handle_after_attribute_value(self, concat(self:pop_sublayer()))
	end
			
	-- Sublayer handler which collects an unquoted attribute value.
	-- The value ends at whitespace, or at "/" or ">" (which are then consumed again
	-- by the outer handler). The end of the text fails the route.
	function handle_unquoted_attribute_value(self, this)
		if this == "" then
			return self:fail_route()
		end
		if this == "/" or this == ">" then
			handle_after_attribute_value(self, concat(self:pop_sublayer()))
			return self:consume()
		end
		if not is_space(this) then
			self:emit(this)
			return
		end
		handle_after_attribute_value(self, concat(self:pop_sublayer()))
	end
	
	-- Stores `attr_value` under the pending attribute name in the layer's
	-- `attributes` table (created on first use), clears the pending name and
	-- returns to handle_before_attribute_name.
	function handle_after_attribute_value(self, attr_value)
		if not self.n.attributes then
			self.n.attributes = {}
		end
		local attr_name = self.n.attr_name
		self.n.attributes[attr_name] = attr_value
		self.n.attr_name = nil
		self.n.handler = handle_before_attribute_name
	end
	
	-- Handler for the contents of a tag.
	-- Tokens are emitted as they are until "</" followed by the tag's own name
	-- (case-insensitive) is seen; the "do_tag_end" route is then tried. If it
	-- succeeds the tag layer is popped, otherwise the "<" is emitted as text.
	-- The end of the text fails the route.
	function handle_tag_block(self, this)
		if this == "" then
			return self:fail_route()
		end
		if (
			this ~= "<" or
			self:read(1) ~= "/" or
			lower(self:read(2)) ~= self.n.name
		) then
			self:emit(this)
			return
		end
		local tag_end = self:get("do_tag_end")
		if tag_end ~= self.n.bad_route then
			return self:pop()
		end
		self:emit("<")
	end
	
	-- Handler for the remainder of a closing tag after its name: whitespace is
	-- skipped, ">" pops the layer, and anything else fails the route.
	function handle_end(self, this)
		if this == ">" then
			return self:pop()
		end
		if is_space(this) then
			return
		end
		return self:fail_route()
	end
	
	-- Route entry point for a tag: installs handle_start on the current layer and
	-- advances one token (past the "<").
	function Parser:do_tag()
		local layer = self.n
		rawset(layer, "handler", handle_start)
		self:advance(1)
	end
	
	-- Route entry point for a closing tag: installs handle_end on the current layer
	-- and advances three tokens (past "<", "/" and the tag name).
	function Parser:do_tag_end()
		local layer = self.n
		rawset(layer, "handler", handle_end)
		self:advance(3)
	end
	
	-- Tries to parse a tag at the current "<". If the "do_tag" route succeeds, a Tag
	-- node is emitted; otherwise the "<" is emitted as plain text.
	function Parser:tag()
		local tag = self:get("do_tag")
		if tag ~= self.n.bad_route then
			self:emit(Tag:new(tag))
			return
		end
		self:emit("<")
	end
end

-- Block handlers.

-- These are blocks which can affect template/argument parsing, since they're also parsed by Parsoid at the same time (even though they aren't processed until later).

-- All blocks (including templates/arguments) can nest inside each other, but an inner block must be closed before the outer block which contains it. This is why, for example, the wikitext "{{template| [[ }}" will result in an unprocessed template, since the inner "[[" is treated as the opening of a wikilink block, which prevents "}}" from being treated as the closure of the template block. On the other hand, "{{template| [[ ]] }}" will process correctly, since the wikilink block is closed before the template closure. It makes no difference whether the block will be treated as valid or not when it's processed later on, so "{{template| [[ }} ]] }}" would also work, even though "[[ }} ]]" is not a valid wikilink.

-- Note that nesting also affects pipes and equals signs, in addition to block closures.

-- These blocks can be nested to any degree, so "{{template| [[ [[ [[ ]] }}" will not work, since only one of the three wikilink blocks has been closed. On the other hand, "{{template| [[ [[ [[ ]] ]] ]] }}" will work.

-- All blocks are implicitly closed by the end of the text, since their validity is irrelevant at this stage.
do
	-- Headings
	-- Opens with "\n=" (or "=" at the start of the text), and closes with "\n". Note that it doesn't matter whether the heading will fail to process due to a premature newline (e.g. if there are no closing signs), so at this stage the only thing that matters for closure is the newline.
	-- Note: if directly inside a template block, a newline followed by a single equals sign is parsed as a parameter equals sign, not the opening of a new L1 heading block. This does not apply to any other heading levels. As such, "\n=}}" will successfully close a template, but "\n==}}" will not, since in the latter case the "}}" would fall inside the new heading block.
	local function handle_heading_block(self, this)
		if this ~= "\n" then
			return self:block_handler(this)
		end
		-- The newline which closes the heading is kept in the output.
		self:emit("\n")
		return self:pop()
	end
	
	-- Language conversion block.
	-- Opens with "-{" and closes with "}-". However, templates/arguments take priority, so "-{{" is parsed as "-" followed by the opening of a template/argument block (depending on what comes after).
	-- Note: Language conversion blocks aren't actually enabled on the English Wiktionary, but Parsoid still parses them at this stage.
	local function handle_language_conversion_block(self, this)
		if this ~= "}" or self:read(1) ~= "-" then
			return self:block_handler(this)
		end
		-- Step over the "}" so that the layer is popped with the head on the "-".
		self:advance()
		self:emit("}")
		self:emit("-")
		return self:pop()
	end
	
	-- Wikilink block.
	-- Opens with "[[" and closes with "]]".
	local function handle_wikilink_block(self, this)
		if this ~= "]" or self:read(1) ~= "]" then
			return self:block_handler(this)
		end
		-- Step over the first "]" so that the layer is popped with the head on the second.
		self:advance()
		self:emit("]")
		self:emit("]")
		return self:pop()
	end
	
	-- Route entry point for a block: installs the given handler on the current layer.
	function Parser:do_block(handler)
		local layer = self.n
		rawset(layer, "handler", handler)
	end
	
	-- Checks whether `this` opens a block. "-{" opens a language conversion block
	-- (unless another "{" follows, in which case "-" is emitted and the braces are
	-- parsed as a template/argument), "=" at the start of a line or of the text
	-- opens a heading block, and "[[" opens a wikilink block. The opening token is
	-- emitted, then the block is parsed via the "do_block" route and its tokens are
	-- emitted. Anything else is passed on to main_handler.
	function Parser:block_handler(this)
		local block
		if this == "-" then
			if self:read(1) == "{" then
				self:advance()
				self:emit("-")
				if self:read(1) == "{" then
					self:template_or_argument()
					return
				end
				block = handle_language_conversion_block
			end
		elseif this == "=" then
			if self:read(-1) == "\n" or self:read(-1) == "" then
				self:advance()
				self:emit("=")
				block = handle_heading_block
			end
		elseif this == "[" then
			if self:read(1) == "[" then
				self:advance()
				self:emit("[")
				block = handle_wikilink_block
			end
		end
		if not block then
			return self:main_handler(this)
		end
		self:emit_tokens(self:get("do_block", block))
	end
end

-- Base handler for every layer.
-- The end of the text pops the layer; "{{" starts a template/argument; "<!--"
-- starts a comment, which is skipped up to and including "-->" (or to the end of
-- the text); "</onlyinclude>" (when onlyinclude mode is on) skips ahead to the next
-- opening onlyinclude tag; any other "<" is tried as a tag. Everything else is
-- emitted unchanged.
function Parser:main_handler(this)
	if this == "" then
		return self:pop()
	end
	if this == "{" and self:read(1) == "{" then
		self:template_or_argument()
		return
	end
	if this ~= "<" then
		self:emit(this)
		return
	end
	if (
		self:read(1) == "!" and
		self:read(2) == "-" and
		self:read(3) == "-"
	) then
		self:advance(4)
		local current, ahead, ahead2 = self:read(), self:read(1), self:read(2)
		while current ~= "" and not (
			current == "-" and ahead == "-" and ahead2 == ">"
		) do
			self:advance()
			-- Slide the three-token window along by one.
			current, ahead, ahead2 = ahead, ahead2, self:read(2)
		end
		self:advance(2)
	elseif (
		self.onlyinclude and
		self:read(1) == "/" and
		self:read(2) == "onlyinclude" and
		self:read(3) == ">"
	) then
		self:advance(4)
		self:not_onlyinclude()
	else
		self:tag()
	end
end

do
	-- If `transcluded` is true, then the text is checked for a pair of onlyinclude tags. If these are found (even if they're in the wrong order), then the start of the page is treated as though it is preceded by a closing onlyinclude tag.
	-- Note that onlyinclude tags *can* be implicitly closed by the end of the text, but the hard requirement above means this can only happen if either the tags are in the wrong order or there are multiple onlyinclude blocks.
	-- Top-level route: stores the raw token lengths and title on the parser, applies the onlyinclude rule above, then installs main_handler on the current layer.
	function Parser:do_parse(raw_lens, str, transcluded, title)
		self.raw_lens = raw_lens
		self.title = title
		if transcluded then
			self.transcluded = true
			local has_pair = match(str, "<onlyinclude>") and match(str, "</onlyinclude>")
			if has_pair then
				self.onlyinclude = true
				self:not_onlyinclude()
				self:advance()
			end
		end
		local layer = self.n
		rawset(layer, "handler", self.main_handler)
	end
	
	-- Tokenises `str` and parses it, returning the second value returned by
	-- Parser:parse.
	-- Every whitespace character and each of ! " ' - / < = > [ ] { | } becomes a
	-- token of its own; the runs of text between them become one token each. The
	-- byte length of every token is recorded in a parallel list, which do_parse
	-- receives as `raw_lens`.
	function export.parse(str, transcluded, title)
		local tokens, lengths, count, cursor = {}, {}, 0, 1
		local function add(token)
			count = count + 1
			tokens[count] = token
			lengths[count] = #token
		end
		for pos, special in gmatch(str, "()([%s!\"'%-/<=>%[%]{|}])") do
			-- Any plain text between the previous special character and this one.
			if pos > cursor then
				add(sub(str, cursor, pos - 1))
			end
			add(special)
			cursor = pos + 1
		end
		-- Trailing plain text after the last special character.
		if cursor <= #str then
			add(sub(str, cursor))
		end
		return (select(2, Parser:parse{
			text = tokens,
			node = Wikitext,
			route = "do_parse",
			lengths,
			str,
			transcluded,
			title
		}))
	end
end

do
	-- Normalize the template name:
	-- (1) Convert to string.
	-- (2) Preprocess.
	-- (3) Resolve any HTML entities.
	-- (4) Convert all characters which the parser treats as spaces in links to conventional spaces.
	-- (5) Remove consecutive spaces.
	-- (6) Trim.
	local function normalize_name(name)
		local resolved = get_entities(frame:preprocess(tostring(name)))
		local spaced = gsub(resolved,
			"[_\194\225-\227][\128\129\154\160]?[\128-\138\142\159\168\169\175]?",
			data.name_spaces
		)
		local collapsed = gsub(spaced, "%s+", " ")
		-- Runs of spaces have been collapsed, so at most one character needs removing from each end; sub is fastest for that.
		local first, last = 1, #collapsed
		if is_space(sub(collapsed, 1, 1)) then
			first = 2
		end
		if is_space(sub(collapsed, last)) then
			last = last - 1
		end
		return sub(collapsed, first, last)
	end
	
	-- Parses `text` (as transcluded, unless `not_transcluded` is set). If the result
	-- is a template node, returns its normalized name and its table of parameters;
	-- otherwise returns nothing.
	function export.parseTemplate(text, not_transcluded)
		local node = export.parse(text, not not_transcluded)
		if not node or node.type ~= "template" then
			return
		end
		return normalize_name(node[1]), node:get_params()
	end
	
	-- Parses `text` (as transcluded, unless `not_transcluded` is set) and returns an
	-- iterator over the template nodes found by the "next_node" traversal. Each call
	-- returns the normalized name, the parameter table, the template's wikitext and
	-- its `pos` field, or nothing once the traversal is finished.
	function export.findTemplates(text, not_transcluded)
		local tree = export.parse(text, not not_transcluded)
		local iterate = tree:__pairs("next_node")
		return function()
			local node = iterate()
			while node and node.type ~= "template" do
				node = iterate()
			end
			if not node then
				return
			end
			return normalize_name(node[1]), node:get_params(), tostring(node), node.pos
		end
	end
end

return export