In this blog post, I am going to use Puppeteer to scrape Wikipedia pages.
Puppeteer is a Node library that provides a high-level API to control Chrome or Chromium over the DevTools Protocol; it runs headless by default.
We are going to achieve this in two steps. The first step is to scrape the Wikipedia page [en.wikipedia.org/wiki/List_of_programming_languages] to get all the programming language names and the URLs of the individual Wikipedia programming language pages [en.wikipedia.org/wiki/:programmingLanguageName], and to store them in an array (which contains objects with a name and a URL).
const puppeteer = require("puppeteer");//headless chrome
var fs = require("fs");
let List_of_programming_languages = [];// stores one { name, url } object per programming language
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(
"https://en.wikipedia.org/wiki/List_of_programming_languages"
);
await page.waitForTimeout(2000);//waiting for 2000 seconds to load the webpage
const getLanguageUrl = await page.evaluate(() => {
let urls = document.querySelectorAll(".div-col ul li a");//the class name for extracting anchor tag
const urlList = [...urls];
console.log(urlList);
const extractedUrls = urlList.map((u) => u.getAttribute("href"));//extracting only links from anchor tags
return extractedUrls;
});
const getLanguageName = await page.evaluate(() => {
let headingFromWeb = document.querySelectorAll(".div-col");//the class name for extracting div
const lanuagesNameList = [...headingFromWeb];
const extractedLanuagesNameList = lanuagesNameList.map((h) => h.innerText);//extracting only inner text from div
return extractedLanuagesNameList;
});
const allLanguagesUrls = [...getLanguageUrl];
const allLanguagesNames = [...getLanguageName];
const nameDataSplit = allLanguagesNames.join("").split("\n");
//saving the urls and name as single object
List_of_programming_languages = nameDataSplit.map((name, i) => {
return {
name,
url: allLanguagesUrls[i],
};
});
The second step uses the above array of URLs. From each specific Wikipedia programming language page, I have taken the full data from the table (which has the class name .infobox), such as [Paradigm, Designed by, Developer, First appeared, Website, etc.], including images, and also took the first four paragraphs for the description. I then generated an object for each programming language, pushed it into an array, and saved the array to a file.
/**
 * Step 2: visits the Wikipedia page of every entry in
 * `List_of_programming_languages`, collects the infobox title, image,
 * label/value rows and a short description, and writes all collected records
 * to "languages.txt" as JSON.
 *
 * A page that fails to load or to be scraped is logged and skipped, so a
 * single bad page does not abort the whole crawl.
 *
 * @returns {Promise<void>} Resolves after the browser is closed and the file
 *   has been written; rejects if launching the browser or writing fails.
 */
const getEachLanguageDetails = async () => {
  const browser = await puppeteer.launch();
  const allLanguagesDetails = [];

  try {
    const page = await browser.newPage();

    // for...of visits exactly the existing entries: no implicit global loop
    // counter and no read one past the end of the array.
    for (const language of List_of_programming_languages) {
      try {
        await page.goto("https://en.wikipedia.org" + language.url);
        await page.waitForTimeout(1000); // pause 1000 ms between pages

        // One round trip to the page collects everything needed from the DOM.
        const scraped = await page.evaluate(() => {
          const textOf = (element) => element.innerText;
          const titleElement = document.querySelector(".infobox-title");
          const imageElement = document.querySelector(".infobox-image a img");
          return {
            title: titleElement ? titleElement.innerText : null,
            imageSrc: imageElement ? imageElement.getAttribute("src") : null,
            labels: [
              ...document.querySelectorAll(".infobox .infobox-label"),
            ].map(textOf),
            values: [
              ...document.querySelectorAll(".infobox .infobox-data"),
            ].map(textOf),
            // Paragraphs at index 1..4 (index 0 is skipped). slice() copes
            // with short pages instead of throwing on a missing paragraph.
            paragraphs: [...document.querySelectorAll("p")]
              .slice(1, 5)
              .map(textOf),
          };
        });

        const eachLanguageDescription = {
          // Fall back to the name from the list page when there is no infobox title.
          name: scraped.title ? scraped.title : language.name,
          image: scraped.imageSrc ? "https:" + scraped.imageSrc : undefined,
          description: scraped.paragraphs.join(" "),
        };
        // Each infobox label becomes a key holding the value at the same index.
        scraped.labels.forEach((label, index) => {
          eachLanguageDescription[label] = scraped.values[index];
        });

        allLanguagesDetails.push(eachLanguageDescription);
      } catch (e) {
        console.log(e);
      }
    }
  } finally {
    // Always shut the browser down; a leftover Chromium keeps the process alive.
    await browser.close();
  }

  // Awaiting the write ties its success or failure to this function's promise.
  await fs.promises.writeFile(
    "languages.txt",
    JSON.stringify(allLanguagesDetails)
  );
  console.log('The "data to append" was appended to file!');
};
getEachLanguageDetails();
await browser.close();
})();
The final code is available at wikipedia-scraping-puppeteer.