JavaScript Extraction Techniques
Link Extraction
Dump All Links (Basic)
// Copy document.links into an array and print each link's href on its own line.
Array.from(document.links).forEach(({ href }) => {
  console.log(href);
});
Dump All Links Using querySelectorAll
// Select every <a> element and print its href; the NodeList's own forEach
// is used, so no intermediate array is created.
document.querySelectorAll("a").forEach(({ href }) => {
  console.log(href);
});
Dump All Links Using for…of
// Walk the <a> elements with for…of, pulling out href by destructuring.
for (const { href } of document.querySelectorAll("a")) {
  console.log(href);
}
Dump All Links Using getElementsByTagName
// getElementsByTagName returns an HTMLCollection, which has no forEach of
// its own, so it is converted to an array before printing each href.
Array.from(document.getElementsByTagName("a")).forEach((anchor) => {
  console.log(anchor.href);
});
Dump All Links (Classic for-loop)
// Index-based loop over every <a> element, printing one href per iteration.
const links = document.getElementsByTagName("a");
for (let index = 0; index < links.length; index += 1) {
  const { href } = links[index];
  console.log(href);
}
Dump Unique Links
// Collect the href of every <a> into a Set, which keeps only the first
// occurrence of each value, then print the values in insertion order.
new Set(Array.from(document.querySelectorAll("a"), (anchor) => anchor.href))
  .forEach((url) => console.log(url));
Open All Links in a New Tab (Clickable List)
// Render every link on the page as a clickable list in a new tab.
const urls = [...document.links].map(l => l.href);
const tab = window.open();
if (tab === null) {
  // window.open() returns null when the browser blocks the pop-up.
  console.warn("Pop-up blocked: allow pop-ups for this page and run the snippet again.");
} else {
  const listDocument = tab.document;
  // Build DOM nodes instead of writing an HTML string: a URL containing
  // quotes, "<" or "&" (e.g. javascript:/data: URLs or query strings) would
  // otherwise break or inject markup in the new tab.
  urls.forEach(url => {
    const anchor = listDocument.createElement("a");
    anchor.href = url;
    anchor.target = "_blank";
    // Keep the opened pages from reaching back into this list through window.opener.
    anchor.rel = "noopener noreferrer";
    anchor.textContent = url;
    listDocument.body.appendChild(anchor);
    listDocument.body.appendChild(listDocument.createElement("br"));
  });
}
Filtered Link Extraction
Internal Links Only
// Print only the links whose hostname equals the current page's hostname.
for (const link of document.links) {
  if (link.hostname !== location.hostname) {
    continue;
  }
  console.log(link.href);
}
External Links Only
// Print the links that point at a different host than the current page.
// Links with an empty hostname (mailto:, tel:, javascript: and similar
// schemes) are not network links to another site, so they are skipped
// instead of being reported as external.
[...document.links]
  .filter(a => a.hostname !== "" && a.hostname !== location.hostname)
  .forEach(a => console.log(a.href));
PDF Links
// Print every link whose path ends in ".pdf".
// The check uses the link's pathname, lower-cased, so PDFs are still found
// when the URL carries a ?query or #fragment or uses an upper-case ".PDF".
[...document.links]
  .filter(a => a.pathname.toLowerCase().endsWith(".pdf"))
  .forEach(a => console.log(a.href));
Download Attribute Links
// Print the href of each <a> element that carries a download attribute.
document.querySelectorAll("a[download]").forEach(({ href }) => {
  console.log(href);
});
Mailto Links
// Print each <a> whose href attribute starts with "mailto:".
for (const anchor of document.querySelectorAll('a[href^="mailto:"]')) {
  console.log(anchor.href);
}
Telephone Links
// Print each <a> whose href attribute starts with "tel:".
Array.from(document.querySelectorAll('a[href^="tel:"]'), (anchor) => anchor.href)
  .forEach((href) => console.log(href));
Media Extraction
Capture All Images
// Print the src of every image in document.images, one per line.
for (const image of document.images) {
  console.log(image.src);
}
Capture Stylesheets
// Print the URL of every stylesheet attached to the page.
// Inline <style> sheets have a null href, so they are skipped rather than
// printed as "null".
[...document.styleSheets]
  .map(sheet => sheet.href)
  .filter(href => href !== null)
  .forEach(href => console.log(href));
Dump All Image URLs to the Console
// Map every image to its src first, then print the resulting URLs.
[...document.images]
  .map((image) => image.src)
  .forEach((src) => console.log(src));
Dump All Files by Extension to a List of Links in a New Tab
// Collect every URL on the page that points at a file with one of the listed
// extensions and show them as a clickable list in a new tab.
const fileExtensions = ['png', 'jpg', 'gif', 'mkv', 'tar', 'zip', 'rar', 'mp4', 'jpeg'];

// Return the last path segment of a URL (the file name), ignoring any ?query
// or #fragment, or '' when the URL cannot be parsed.
const fileNameOf = url => {
  try {
    return new URL(url, location.href).pathname.split('/').pop();
  } catch (error) {
    // An unparseable URL simply has no file name; it is filtered out below.
    return '';
  }
};

// True when the URL's file name ends in one of the wanted extensions.
const hasWantedExtension = url => {
  const fileName = fileNameOf(url);
  return fileName.includes('.') && fileExtensions.includes(fileName.split('.').pop().toLowerCase());
};

// Archives and videos are reached through <a href> or media elements rather
// than <img>, so scan link targets as well as every element with a src.
const candidates = [
  ...[...document.links].map(a => a.href),
  ...[...document.querySelectorAll('img, video, audio, source, embed')].map(el => el.src),
];
// The Set drops URLs that appear more than once (e.g. a thumbnail linking to itself).
const links = [...new Set(candidates)].filter(url => url !== '' && hasWantedExtension(url));

// Open a new tab with a blank page
const newTab = window.open('about:blank', '_blank');
if (newTab === null) {
  // window.open() returns null when the browser blocks the pop-up.
  console.warn('Pop-up blocked: allow pop-ups for this page and run the snippet again.');
} else {
  const newTabDocument = newTab.document;
  // Create a list of href links
  const ul = newTabDocument.createElement('ul');
  links.forEach(link => {
    const li = newTabDocument.createElement('li');
    const a = newTabDocument.createElement('a');
    a.href = link;
    a.textContent = fileNameOf(link); // Set the text content to the file name (without query string)
    a.style.display = 'block'; // Display each link on a new line
    li.appendChild(a);
    ul.appendChild(li);
  });
  newTabDocument.body.appendChild(ul);
}
Dump All Images to a New Tab with Default-Size Previews
// Show every image on the page in a new tab; each preview is wrapped in a
// link that opens the image itself.
const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const newTab = window.open();
if (newTab === null) {
  // window.open() returns null when the browser blocks the pop-up.
  console.warn('Pop-up blocked: allow pop-ups for this page and run the snippet again.');
} else {
  const previewDocument = newTab.document;
  const list = previewDocument.createElement('ul');
  list.style.listStyleType = 'none';
  list.style.padding = '0';
  // Build DOM nodes instead of writing an HTML string: quotes, "<" or "&" in
  // an image URL would otherwise break or inject markup, and document.write
  // without document.close() leaves the tab in a loading state.
  imageUrls.forEach((url) => {
    const item = previewDocument.createElement('li');
    const anchor = previewDocument.createElement('a');
    anchor.href = url;
    anchor.target = '_blank';
    anchor.rel = 'noopener noreferrer';
    // No width/height is set, so the preview renders at the image's own size.
    const preview = previewDocument.createElement('img');
    preview.src = url;
    anchor.appendChild(preview);
    item.appendChild(anchor);
    list.appendChild(item);
  });
  previewDocument.body.appendChild(list);
}
Email Extraction
Extract Emails Using Regex
// Print every e-mail-like string found in the page's HTML source.
// The top-level-domain class is [A-Za-z]; a "|" inside a character class is
// a literal pipe, not alternation, so it must not appear there or strings
// such as "a@b.c|d" would be accepted.
const regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
const html = document.documentElement.innerHTML;
let match;
// With the g flag, exec() resumes after the previous match and returns null
// once no further match exists.
while ((match = regex.exec(html)) !== null) {
  console.log(match[0]);
}