/**
* @fileOverview Contains the scraper class, responsible for extracting
* information from the warframe wikia mods pages.
* @author Pedro Miguel Pereira Serrano Martins
* @version 2.3.2
*/
/*jslint node: true */
"use strict";
let cheerio = require('cheerio');
let request = require("request");
let Promise = require("promise");
let HTTPStatus = require("http-status");
/**
 * <p>Name of the Description column in the mods table.</p>
 *
 * <p>getTableInfo compares each table header against this value; cells in
 * the matching column get their text cleaned up and are checked for the
 * PvP keyword.</p>
 *
 * @const {string}
 * @readonly
 */
const COLUMN_DESCRIPTION = "Description";
/**
 * <p>Name of the Polarity column in the mods table.</p>
 *
 * <p>getTableInfo compares each table header against this value; cells in
 * the matching column are read from their image (alt text and source link)
 * instead of from their text.</p>
 *
 * @const {string}
 * @readonly
 */
const COLUMN_POLARITY = "Polarity";
/**
 * <p>Keyword dictionary for specific words that the scraper tests against.</p>
 *
 * <p>Usually it is a better idea to check for logic patterns or HTML tags, but
 * when those are non-existent certain keywords are used as a last resort.</p>
 *
 * <p>The object is frozen so that the values really are read-only, as the
 * documentation promises.</p>
 *
 * @const {Object}
 * @readonly
 */
const KEYWORDS = Object.freeze({
    PVP_ONLY: "PvP",
    TRANSMUTABLE: "Transmutable",
    NA: "N/A"
});
/**
 * <p>This class is responsible for making requests and extracting information
 * from the Warframe Wikia. All its methods either query or process HTML
 * information from that page. As with all scrapers, the CSS selectors used
 * here are tied to the current page layout: when the wikia gets an uplift,
 * this class must be updated too.</p>
 *
 * <p>This class makes use of Promises for all the methods that make requests,
 * so I advise you to be comfortable with them.</p>
 *
 * @see {@link https://www.toptal.com/javascript/javascript-promises}
 */
class ModScraper {
    /**
     * <p>Initializes a ModScraper instance with the given configurations.</p>
     *
     * @constructor
     * @param {Object} config The configuration object with all the links
     *                        and config parameters for the scraper. It must
     *                        provide <code>sources.wikia</code> (with
     *                        <code>link</code> and <code>pages</code>) and
     *                        <code>network.request</code> (the options passed
     *                        to the request library).
     * @public
     */
    constructor(config) {
        this.config = config;
        this.wikiaSource = this.config.sources.wikia;
        this.request = this.config.network.request;
    }

    /**
     * <p>Queries the Warframe Mods table containing all the Warframe mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getWarframeMods() {
        return this.getModsTable("warframe_mods");
    }

    /**
     * <p>Queries the Rifle Mods table containing all the Rifle mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getRifleMods() {
        return this.getModsTable("rifle_mods");
    }

    /**
     * <p>Queries the Shotgun Mods table containing all the Shotgun mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getShotgunMods() {
        return this.getModsTable("shotgun_mods");
    }

    /**
     * <p>Queries the Pistol Mods table containing all the Pistol mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getPistolMods() {
        return this.getModsTable("pistol_mods");
    }

    /**
     * <p>Queries the Melee Mods table containing all the Melee mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getMeleeMods() {
        return this.getModsTable("melee_mods");
    }

    /**
     * <p>Queries the Sentinel Mods table containing all the Sentinel mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getSentinelMods() {
        return this.getModsTable("sentinel_mods");
    }

    /**
     * <p>Queries the Kubrow Mods table containing all the Kubrow mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getKubrowMods() {
        return this.getModsTable("kubrow_mods");
    }

    /**
     * <p>Queries the Aura Mods table containing all the Aura mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getAuraMods() {
        return this.getModsTable("aura_mods");
    }

    /**
     * <p>Queries the Stance Mods table containing all the Stance mods and
     * information relative to them.</p>
     *
     * @return {Promise} A Promise with an array of objects, one per table row.
     * @public
     */
    getStanceMods() {
        return this.getModsTable("stance_mods");
    }

    /**
     * <p>Builds the URL of one of the configured wikia pages and scrapes its
     * mods table.</p>
     *
     * @param {string} pageKey Key of the page inside
     *                         <code>config.sources.wikia.pages</code>.
     * @return {Promise} The Promise returned by getTableInfo for that page.
     * @private
     */
    getModsTable(pageKey) {
        return this.getTableInfo(this.wikiaSource.link + this.wikiaSource.pages[pageKey]);
    }

    /**
     * <p>Makes a request to the given mod URL, downloads its information table and
     * returns an object with the information parsed.</p>
     *
     * <p>Stance pages are detected by the presence of a link to the Stance
     * wiki page in the HTML and are parsed differently from regular mods
     * (they carry a WeaponsList and a fixed number of ranks).</p>
     *
     * @param {string} modURL The URL of the wikia page containing the mod
     *                        information to parse.
     * @return {Promise} A Promise containing an object representing the
     *                   mod. On failure the Promise is rejected with a string
     *                   of the form "Error with <modURL>. Error: <cause>".
     * @public
     * @idea The Wiki should have the SlotType where the mod is to be equipped on
     * the description table.
     */
    getModInformation(modURL) {
        //hack to know if this is a stance
        const stanceMarker = '<a href="/wiki/Stance" title="Stance">Stance</a>';

        return this.requestHTML(modURL)
            .then(htmlBody => {
                const $ = cheerio.load(htmlBody);
                if (htmlBody.includes(stanceMarker)) {
                    return this.parseStancePage($, htmlBody, modURL);
                }
                return this.parseModPage($, modURL);
            })
            .catch(error => {
                //Rejection reasons are kept as strings because callers already rely on that.
                throw ("Error with " + modURL + ". Error: " + error);
            });
    }

    /**
     * <p>Builds the object describing a stance mod from its wikia page.</p>
     *
     * @param {Function} $        The cheerio instance loaded with the page.
     * @param {string}   htmlBody The raw HTML of the page.
     * @param {string}   modURL   The URL the page was downloaded from.
     * @return {Object} The stance information, including WeaponsList and
     *                  DroppedBy.
     * @private
     */
    parseStancePage($, htmlBody, modURL) {
        const infobox = this.readInfobox($);
        //The Description paragraph always starts with the name of the stance in bold.
        const descriptionParagraph = $(htmlBody).find('div').find('p').find('b').parent();

        const stanceInfo = {
            Name: infobox.name,
            Description: descriptionParagraph.text().trim(),
            SlotType: "Stance",
            URL: modURL,
            Ranks: 3, //all stances have 3 ranks
            Rarity: infobox.rarity,
            TraddingTax: infobox.tradingTax,
            Transmutable: infobox.transmutable,
            ImageURL: infobox.imageURL
        };

        const polarity = this.readPolarity($);
        if (polarity !== undefined) {
            stanceInfo.Polarity = polarity;
        }

        //The list of compatible weapons is two siblings after the description.
        const weaponsList = [];
        descriptionParagraph.next().next().find("li").each((index, listItem) => {
            const rawName = $(listItem).text().trim();
            //An asterisk marks the weapons whose polarity matches the stance.
            const matchesPolarity = rawName.includes("*");
            const links = [];
            $(listItem).find("a").each((linkIndex, anchor) => {
                links.push($(anchor).attr("href"));
            });
            weaponsList.push({
                Name: matchesPolarity ? rawName.replace("*", "") : rawName,
                MatchesPolarity: matchesPolarity,
                Links: links
            });
        });
        stanceInfo.WeaponsList = weaponsList;

        stanceInfo.DroppedBy = this.readDroppedBy($);
        return stanceInfo;
    }

    /**
     * <p>Builds the object describing a regular (non-stance) mod from its
     * wikia page.</p>
     *
     * @param {Function} $      The cheerio instance loaded with the page.
     * @param {string}   modURL The URL the page was downloaded from.
     * @return {Object} The mod information, including Tradable and DroppedBy.
     * @private
     */
    parseModPage($, modURL) {
        const infobox = this.readInfobox($);

        const modInfo = {
            Name: infobox.name,
            Description: $("#mw-content-text > div.tooltip-content.Infobox_Parent").next().text().trim(),
            Rarity: infobox.rarity,
            TraddingTax: infobox.tradingTax,
            URL: modURL,
            Transmutable: infobox.transmutable,
            //the rank table has two rows that are not ranks
            Ranks: $("#mw-content-text > table.emodtable").find("tr").length - 2,
            ImageURL: infobox.imageURL
        };

        const polarity = this.readPolarity($);
        if (polarity !== undefined) {
            modInfo.Polarity = polarity;
        }

        //A mod is tradable when the wiki lists an actual trading tax for it.
        modInfo.Tradable = (modInfo.TraddingTax !== KEYWORDS.NA && modInfo.TraddingTax !== "");
        modInfo.DroppedBy = this.readDroppedBy($);
        return modInfo;
    }

    /**
     * <p>Reads the fields that stance and regular mod pages have in common.</p>
     *
     * @param {Function} $ The cheerio instance loaded with the page.
     * @return {Object} An object with <code>name</code>, <code>rarity</code>,
     *                  <code>tradingTax</code>, <code>transmutable</code> and
     *                  <code>imageURL</code>.
     * @private
     */
    readInfobox($) {
        return {
            name: $("#WikiaPageHeader > div > div.header-column.header-title > h1").text().trim(),
            rarity: $("div.pi-item:nth-child(3) > div:nth-child(2)").text().trim(),
            tradingTax: $("div.pi-item:nth-child(4) > div:nth-child(2)").text().trim(),
            transmutable: $("#mw-content-text > div > aside > section > div:nth-child(6) > div > div").text().trim() === KEYWORDS.TRANSMUTABLE,
            imageURL: $("#mw-content-text > div.tooltip-content.Infobox_Parent > aside > figure > a > img").attr("src")
        };
    }

    /**
     * <p>Reads the polarity name from the image link titled "Polarity".</p>
     *
     * @param {Function} $ The cheerio instance loaded with the page.
     * @return {string|undefined} The first word of the image's alt text, or
     *                            undefined when the page has no polarity link
     *                            or its image has no alt text.
     * @private
     */
    readPolarity($) {
        const polarityNode = $('a.image.image-thumbnail.link-internal').filter((index, node) => {
            return $(node).attr('title') === 'Polarity';
        });
        if (polarityNode.length === 0) {
            return undefined;
        }

        //A missing alt attribute must not make the whole page fail to parse.
        const altText = polarityNode.find("img").attr("alt");
        if (altText === undefined) {
            return undefined;
        }
        return altText.trim().split(" ")[0].trim();
    }

    /**
     * <p>Reads the "dropped by" section of the infobox. Entries are separated
     * by <code>&lt;br&gt;</code> tags in the page.</p>
     *
     * @param {Function} $ The cheerio instance loaded with the page.
     * @return {Array} One object per entry, each with a <code>Name</code> and
     *                 the list of <code>Links</code> found in that entry.
     * @private
     */
    readDroppedBy($) {
        const droppedByHTML = $("#mw-content-text > div.tooltip-content.Infobox_Parent > aside > section > div:nth-child(5) > div").toString();
        const droppedBy = [];

        for (const rawEntry of droppedByHTML.split("<br>")) {
            //remove text between parenthesis, as it is useless and often confuses the parser
            const entry = rawEntry.replace(/ *\([^)]*\) */g, "");
            const fragment = $(entry);
            const links = [];

            //the entry can either contain anchors or be an anchor itself
            fragment.find("a").add(fragment.filter("a")).each((index, anchor) => {
                links.push($(anchor).attr("href"));
            });

            droppedBy.push({
                Name: fragment.text().trim() || entry,
                Links: links
            });
        }
        return droppedBy;
    }

    /**
     * <p>Auxiliary method, makes a request to the given URL and reads the
     * sortable list table of that page. Parses it to extract all valuable
     * information and builds an array with one object per table row, which is
     * then returned if the promise is fulfilled.</p>
     *
     * <p>The keys of each object are the table headers. Columns with links
     * get an extra "<header>Link" key, and the Description column also sets
     * the boolean <code>PvPOnly</code>.</p>
     *
     * @param {string} url The complete URL of the webpage where the HTML
     *                     table containing all the information is.
     * @return {Promise} A Promise with an array of objects containing the data.
     * @private
     */
    getTableInfo(url) {
        //Matches words that were glued together, e.g. "damageIncreases".
        const gluedWords = /[a-z]+[A-Z][a-z]+/;

        return this.requestHTML(url).then(htmlBody => {
            const $ = cheerio.load(htmlBody);
            const rows = $("table.listtable.sortable").find("tr");

            //the first row holds the headers, all the others hold data
            const headers = rows.first().children().text().trim().split("\n").map(header => header.trim());
            const result = [];

            rows.slice(1).each((rowIndex, row) => {
                const cells = $(row).children();
                const item = {};

                headers.forEach((header, tdIndex) => {
                    const cell = cells.eq(tdIndex);
                    let line = cell.text().trim();

                    if (header === COLUMN_POLARITY) {
                        const image = cell.find("img");
                        const altText = image.attr("alt");
                        //Fall back to the cell text when there is no image, instead of failing the whole table.
                        item[header] = altText !== undefined ? altText.split(" ")[0] : line;
                        //Because the wikia has conflicting scripts, we must ensure we get the field we want.
                        item[header + "Link"] = image.attr("data-src") || image.attr("src");
                    }
                    else if (header === COLUMN_DESCRIPTION) {
                        //Sentences lose their separator in the table; put a ". " back before each glued capital.
                        for (const word of line.split(" ")) {
                            if (gluedWords.test(word)) {
                                const fixedWord = word[0] + word.slice(1).replace(/([A-Z])/g, '. $1');
                                line = line.replace(word, () => fixedWord);
                            }
                        }
                        //http://stackoverflow.com/questions/6163169/replace-multiple-whitespaces-with-single-whitespace-in-javascript-string#
                        item[header] = line.replace(/\s+/g, " ") + ".";
                        //check if it is PvP only.
                        item.PvPOnly = item[header].includes(KEYWORDS.PVP_ONLY);
                    }
                    else {
                        //if the header has associated links, we take them
                        const anchors = cell.find("a");
                        if (anchors.length) {
                            item[header + "Link"] = anchors.attr("href");
                        }
                        item[header] = line;
                    }
                });

                result.push(item);
            });
            return result;
        });
    }

    /**
     * <p>Makes a GET request to the given URL to download its HTML content and
     * returns a Promise of a result.</p>
     *
     * <p>On successful completion of the request the promise is fulfilled with
     * the body of the HTML page. On failure, it is rejected with a string
     * naming the error and the URL.</p>
     *
     * <p>This method does not make multiple retries, but if an error occurs it
     * differentiates between the error types on a best effort basis. The
     * configured request options are copied for every call, so the shared
     * configuration object is never modified.</p>
     *
     * @todo Check if the page changed from last time before making the request.
     *
     * @param {string} url The URL of the page whose HTML content we want to
     *                     download.
     * @return {Promise} A Promise fulfilled with the body, or rejected with
     *                   one of the strings below followed by " for <url>".
     * @throws PageUnreacheableException  Generic error used when the given
     *                                    page cannot be reached, including
     *                                    responses whose status is not OK.
     * @throws ReadTimeoutException       Error generated when the given wikia
     *                                    page takes too long to read or write
     *                                    a response to us. Likely means the
     *                                    wikia server is overloaded with work.
     * @throws ConnectionTimeoutException Error issued when our server cannot
     *                                    connect to the wikia server. Likely
     *                                    means that the wikia server is down,
     *                                    or that there is a connection issue
     *                                    on our side.
     * @public
     */
    requestHTML(url) {
        const options = Object.assign({}, this.request, { url: url });

        return new Promise((fulfil, reject) => {
            request(options, (error, response, body) => {
                if (!error && response.statusCode === HTTPStatus.OK) {
                    fulfil(body);
                    return;
                }

                //error is null when the server answered with a non OK status
                const code = error ? error.code : undefined;

                if (code === 'ETIMEDOUT' && error.connect) {
                    //Tried to establish a connection to the wiki server and failed
                    reject("ConnectionTimeoutException for " + url);
                }
                else if (code === 'ETIMEDOUT' || code === 'ESOCKETTIMEDOUT') {
                    //The wiki server is too slow responding (likely overload)
                    reject("ReadTimeoutException for " + url);
                }
                else {
                    //general error occurred
                    reject("PageUnreacheableException for " + url);
                }
            });
        });
    }
}
// The ModScraper class itself is the module's export (CommonJS).
module.exports = ModScraper;