Module:Citation/CS1/COinS

From Viki
Jump to navigation Jump to search

Documentation for this module may be created at Module:Citation/CS1/COinS/doc

  1 --[[--------------------------< F O R W A R D   D E C L A R A T I O N S >--------------------------------------
  2 ]]
  3 
  4 local is_set, in_array, remove_wiki_link, strip_apostrophe_markup;				-- functions in Module:Citation/CS1/Utilities
  5 
  6 local cfg;																		-- table of configuration tables that are defined in Module:Citation/CS1/Configuration
  7 
  8 
  9 --[[--------------------------< M A K E _ C O I N S _ T I T L E >----------------------------------------------
 10 
 11 Makes a title for COinS from Title and / or ScriptTitle (or any other name-script pairs)
 12 
 13 Apostrophe markup (bold, italics) is stripped from each value so that the COinS metadata isn't corrupted with strings
 14 of %27%27...
 15 
 16 ]]
 17 
 18 local function make_coins_title (title, script)
 19 	if is_set (title) then
 20 		title = strip_apostrophe_markup (title);								-- strip any apostrophe markup
 21 	else
 22 		title='';																-- if not set, make sure title is an empty string
 23 	end
 24 	if is_set (script) then
 25 		script = script:gsub ('^%l%l%s*:%s*', '');								-- remove language prefix if present (script value may now be empty string)
 26 		script = strip_apostrophe_markup (script);								-- strip any apostrophe markup
 27 	else
 28 		script='';																-- if not set, make sure script is an empty string
 29 	end
 30 	if is_set (title) and is_set (script) then
 31 		script = ' ' .. script;													-- add a space before we concatenate
 32 	end
 33 	return title .. script;														-- return the concatenation
 34 end
 35 
 36 
 37 --[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >----------------------------------
 38 
 39 Returns a string where all of lua's magic characters have been escaped.  This is important because functions like
 40 string.gsub() treat their pattern and replace strings as patterns, not literal strings.
 41 ]]
 42 
 43 local function escape_lua_magic_chars (argument)
 44 	argument = argument:gsub("%%", "%%%%");										-- replace % with %%
 45 	argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1");				-- replace all other lua magic pattern characters
 46 	return argument;
 47 end
 48 
 49 
 50 --[[--------------------------< G E T _ C O I N S _ P A G E S >------------------------------------------------
 51 
 52 Extract page numbers from external wikilinks in any of the |page=, |pages=, or |at= parameters for use in COinS.
 53 
 54 ]]
 55 
 56 local function get_coins_pages (pages)
 57 	local pattern;
 58 	if not is_set (pages) then return pages; end								-- if no page numbers then we're done
 59 	
 60 	while true do
 61 		pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]");					-- pattern is the opening bracket, the url and following space(s): "[url "
 62 		if nil == pattern then break; end										-- no more urls
 63 		pattern = escape_lua_magic_chars (pattern);								-- pattern is not a literal string; escape lua's magic pattern characters
 64 		pages = pages:gsub(pattern, "");										-- remove as many instances of pattern as possible
 65 	end
 66 	pages = pages:gsub("[%[%]]", "");											-- remove the brackets
 67 	pages = pages:gsub("–", "-" );							-- replace endashes with hyphens
 68 	pages = pages:gsub("&%w+;", "-" );						-- and replace html entities (&ndash; etc.) with hyphens; do we need to replace numerical entities like &#32; and the like?
 69 	return pages;
 70 end
 71 
 72 
 73 --[=[-------------------------< C O I N S _ R E P L A C E _ M A T H _ S T R I P M A R K E R >------------------
 74 
 75 There are three options for math markup rendering that depend on the editor's math preference settings.  These
 76 settings are at [[Special:Preferences#mw-prefsection-rendering]] and are
 77 	PNG images
 78 	TeX source
 79 	MathML with SVG or PNG fallback
 80 
 81 All three are heavy with html and css which doesn't belong in the metadata.
 82 
 83 Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings
 84 of the last editor to save the page.
 85 
 86 This function gets the rendered form of an equation according to the editor's preference before the page is saved.  It
 87 then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so
 88 that the page is saved without extraneous html/css markup and with a reasonably readable text form of the equation.
 89 
 90 When a replacement is made, this function returns true and the value with replacement; otherwise false and the intital
 91 value.  To replace multipe equations it is necesary to call this function from within a loop.
 92 
 93 ]=]
 94 
 95 local function coins_replace_math_stripmarker (value)
 96 	local stripmarker = cfg.stripmarkers['math'];
 97 	local rendering = value:match (stripmarker);								-- is there a math stripmarker
 98 
 99 	if not rendering then														-- when value doesn't have a math stripmarker, abandon this test
100 		return false, value;
101 	end
102 	
103 	rendering = mw.text.unstripNoWiki (rendering);								-- convert stripmarker into rendered value (or nil? ''? when math render error)
104 	
105 	if rendering:match ('alt="[^"]+"') then										-- if PNG math option
106 		rendering = rendering:match ('alt="([^"]+)"');							-- extract just the math text
107 	elseif rendering:match ('$%s+.+%s+%$') then									-- if TeX math option; $ is legit character that is escapes as \$
108 		rendering = rendering:match ('$%s+(.+)%s+%$')							-- extract just the math text
109 	elseif rendering:match ('<annotation[^>]+>.+</annotation>') then			-- if MathML math option
110 		rendering = rendering:match ('<annotation[^>]+>(.+)</annotation>')		-- extract just the math text
111 	else
112 		return false, value;													-- had math stripmarker but not one of the three defined forms
113 	end
114 	
115 	return true, value:gsub (stripmarker, rendering, 1);
116 end
117 
118 
--[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------

Cleans up a parameter value for the metadata by removing or replacing invisible characters and certain html
entities.  In order:
	math stripmarkers are replaced with the text form of the equation, or with an error message when that fails
	nowiki stripmarkers are replaced with their content
	the markup that {{'}} and {{'s}} produce is replaced with a plain apostrophe or apostrophe-s
	&nbsp; entities and hair spaces become plain spaces
	zero-width joiners, zero-width spaces and soft hyphens are removed (skipped when value matches cfg.indic_script)
	horizontal tabs, line feeds and carriage returns become plain spaces

2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaces math stripmarkers with the appropriate content
when it shouldn't.  See https://phabricator.wikimedia.org/T121085 and Wikipedia_talk:Lua#stripmarkers_and_mw.text.unstripNoWiki.28.29

TODO: move the replacement patterns and replacement values into a table in /Configuration similar to the invisible
characters table?

]]

local function coins_cleanup (value)
	local replaced;
	repeat																		-- one math stripmarker per pass, until a pass replaces nothing
		replaced, value = coins_replace_math_stripmarker (value);
	until not replaced;

	value = value:gsub (cfg.stripmarkers['math'], "MATH RENDER ERROR");			-- any that are left couldn't be replaced; insert vague error message
	value = mw.text.unstripNoWiki (value);										-- replace nowiki stripmarkers with their content

	value = value
		:gsub ('<span class="nowrap" style="padding%-left:0%.1em;">&#39;(s?)</span>', "'%1")	-- {{'}} or {{'s}} to simple apostrophe or apostrophe-s
		:gsub ('&nbsp;', ' ')													-- &nbsp; entity to plain space
		:gsub ('\226\128\138', ' ');											-- hair space to plain space

	local has_indic_script = mw.ustring.find (value, cfg.indic_script);
	if not has_indic_script then												-- zero width joiner characters are left alone in indic script
		value = value:gsub ('&zwj;', '');										-- remove &zwj; entities
		value = mw.ustring.gsub (value, '[\226\128\141\226\128\139\194\173]', '');	-- remove zero-width joiner, zero-width space, soft hyphen
	end

	return (value:gsub ('[\009\010\013]', ' '));								-- horizontal tab, line feed, carriage return to plain space; string only
end
151 
152 
--[[--------------------------< C O I N S >--------------------------------------------------------------------

COinS metadata (see <http://ocoins.info/>) allows automated tools to parse the citation information.

data is the table of metadata values for one citation; class is the citation class ('journal', 'book', 'web',
...).  Every value in data except ID_list and Authors is passed through coins_cleanup() and written back into
data.  Returns the metadata as a single string of url-encoded key=value pairs joined with '&', beginning with
the ctx_ver pair; returns an empty string when data is not a table or is an empty table.

The metadata format is chosen from class:
	journal: the preprint, journal, news and magazine classes; also conference, interview, map, press release and
		web when Periodical is set, and citation when Periodical is set and Encyclopedia is not
	dissertation: thesis
	book: everything else; rft.genre distinguishes

]]

local function COinS(data, class)
	if type (data) ~= 'table' or next (data) == nil then						-- nothing to make metadata from
		return '';
	end

	for name, raw in pairs (data) do											-- spin through all of the metadata parameter values
		if not ('ID_list' == name or 'Authors' == name) then					-- except the ID_list and Author tables (author names are cleaned up when the Author table is processed)
			data[name] = coins_cleanup (raw);
		end
	end

	local ctx_ver = 'Z39.88-2004';
	local kev = {};																-- sequence of 'key=value' strings in the order that they are added

	local function add (key, value)											-- append key=value; values that are not set are skipped
		if is_set (value) then
			kev[#kev + 1] = table.concat ({key, '=', mw.uri.encode (remove_wiki_link (value))});
		end
	end

	local as_journal = in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn', 'journal', 'news', 'magazine'})
		or (in_array (class, {'conference', 'interview', 'map', 'press release', 'web'}) and is_set (data.Periodical))
		or ('citation' == class and is_set (data.Periodical) and not is_set (data.Encyclopedia));

	if as_journal then
		add ('rft_val_fmt', 'info:ofi/fmt:kev:mtx:journal');					-- journal metadata identifier

		local genre = 'article';												-- journal and other 'periodical' articles
		if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn'}) then
			genre = 'preprint';													-- cite arxiv, cite biorxiv, cite citeseerx, cite ssrn
		elseif 'conference' == class then
			genre = 'conference';												-- cite conference (when Periodical set)
		elseif 'web' == class then
			genre = 'unknown';													-- cite web (when Periodical set)
		end
		add ('rft.genre', genre);

		add ('rft.jtitle', data.Periodical);									-- journal only
		add ('rft.atitle', data.Title);											-- 'periodical' article titles
																				-- these used only for periodicals
		add ('rft.ssn', data.Season);											-- keywords: winter, spring, summer, fall
		add ('rft.chron', data.Chron);											-- free-form date components
		add ('rft.volume', data.Volume);										-- does not apply to books
		add ('rft.issue', data.Issue);
		add ('rft.pages', data.Pages);											-- also used in book metadata

	elseif 'thesis' == class then												-- cite thesis
		add ('rft_val_fmt', 'info:ofi/fmt:kev:mtx:dissertation');				-- dissertation metadata identifier
		add ('rft.title', data.Title);											-- dissertation (also patent but that is not yet supported)
		add ('rft.degree', data.Degree);										-- dissertation only
		add ('rft.inst', data.PublisherName);									-- book and dissertation

	else																		-- all others are treated as 'book' metadata; genre distinguishes
		add ('rft_val_fmt', 'info:ofi/fmt:kev:mtx:book');						-- book metadata identifier

		if 'report' == class or 'techreport' == class then						-- cite report and cite techreport
			add ('rft.genre', 'report');
		elseif 'conference' == class then										-- cite conference when Periodical not set
			add ('rft.genre', 'conference');
			add ('rft.atitle', data.Chapter);									-- conference paper as chapter in proceedings (book)
		elseif in_array (class, {'book', 'citation', 'encyclopaedia', 'interview', 'map'}) then
			if is_set (data.Chapter) then
				add ('rft.genre', 'bookitem');
				add ('rft.atitle', data.Chapter);								-- book chapter, encyclopedia article, interview in a book, or map title
			elseif 'map' == class or 'interview' == class then
				add ('rft.genre', 'unknown');									-- standalone map or interview
			else
				add ('rft.genre', 'book');										-- book and encyclopedia
			end
		else																	-- audio-visual, episode, mailinglist, newsgroup, podcast, press release, serial, sign, speech, web, ...
			add ('rft.genre', 'unknown');
		end

		add ('rft.btitle', data.Title);											-- book only
		add ('rft.place', data.PublicationPlace);								-- book only
		add ('rft.series', data.Series);										-- book only
		add ('rft.pages', data.Pages);											-- book, journal
		add ('rft.edition', data.Edition);										-- book only
		add ('rft.pub', data.PublisherName);									-- book and dissertation
	end
																				-- and now common parameters (as much as possible)
	add ('rft.date', data.Date);												-- book, journal, dissertation

	for id_name, id_value in pairs (data.ID_list) do							-- identifiers; for now assumed to be common to all formats
		if 'ISBN' == id_name then
			id_value = id_value:gsub ('[^-0-9X]', '');							-- keep only digits, hyphens and the X check character
		end

		local coins_key = cfg.id_handlers[id_name].COinS;
		local key_text = coins_key or '';

		if 'info' == string.sub (key_text, 1, 4) then							-- for ids that are in the info:registry
			add ('rft_id', table.concat ({coins_key, '/', id_value}));
		elseif 'rft' == string.sub (key_text, 1, 3) then						-- for isbn, issn, eissn, etc that have defined COinS keywords
			add (coins_key, id_value);
		elseif coins_key then													-- when cfg.id_handlers[id_name].COinS is not nil
			add ('rft_id', table.concat ({cfg.id_handlers[id_name].prefix, id_value}));	-- others; provide a url
		end
	end

	for position, author in ipairs (data.Authors) do
		local surname = coins_cleanup (author.last);							-- replace any nowiki strip markers, non-printing or invisible characters
		local given = coins_cleanup (author.first or '');

		if is_set (surname) then												-- a name without a surname adds nothing
			if not is_set (given) then
				add ('rft.au', surname);										-- surname only, whatever the position in the list
			elseif 1 == position then											-- first author with both names gets the split form
				add ('rft.aulast', surname);
				add ('rft.aufirst', given);
			else																-- all other authors
				add ('rft.au', table.concat ({surname, ', ', given}));
			end
		end
	end

	add ('rft_id', data.URL);
	add ('rfr_id', table.concat ({'info:sid/', mw.site.server:match ('[^/]*$'), ':', data.RawPage}));

	-- pairs are deliberately not sorted; the version string always goes first
	table.insert (kev, 1, 'ctx_ver=' .. ctx_ver);								-- such as "Z39.88-2004"
	return table.concat (kev, '&');
end
280 
281 
--[[--------------------------< S E T _ S E L E C T E D _ M O D U L E S >--------------------------------------

Points this module's cfg table and its imported utility functions at the same (live or sandbox) modules that the
other modules use.  cfg_table_ptr is the configuration table; utilities_page_ptr is the table of functions from
the selected Module:Citation/CS1/Utilities module.

]]

local function set_selected_modules (cfg_table_ptr, utilities_page_ptr)
	cfg = cfg_table_ptr;

	is_set, in_array, remove_wiki_link, strip_apostrophe_markup =				-- import functions from selected Module:Citation/CS1/Utilities module
		utilities_page_ptr.is_set,
		utilities_page_ptr.in_array,
		utilities_page_ptr.remove_wiki_link,
		utilities_page_ptr.strip_apostrophe_markup;
end
296 
297 
--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------

The functions that other modules may call.  Everything else in this module is private to it.

]]

local exports = {
	set_selected_modules = set_selected_modules,
	make_coins_title = make_coins_title,
	get_coins_pages = get_coins_pages,
	COinS = COinS,
};

return exports;