Why is variable body empty? how to get body tag content?
I am writing code for a Google Chrome extension. I plan to use the extension for personal use only, for web scraping, so that I can finally analyze the text. I want to play with text.
background.js
// Registers a tab-update listener that tries to read the page text and
// pull sentence-like fragments out of it with a regular expression.
chrome.tabs.onUpdated.addListener(function(tabId, changeInfo, tab) {
// NOTE(review): the manifest registers this file as a background script, so
// `document` here is presumably the background page, not the updated tab's page.
if(document.readyState == "complete"){
var url = changeInfo.url;
// I want to save the url to a file(url.txt)
var body = document.body.innerText;
// why is body empty?????
// Lazily matches from an uppercase ASCII letter up to the next literal period.
var pattern = /[A-Z].*?\./g;
// NOTE(review): `text` is not declared anywhere in this snippet; the page text
// was stored in `body` above.
var result = text.match(pattern);
result.forEach(myFunction);
function myFunction(item) {
// NOTE(review): there is no operator between `item` and "\n", so this line
// does not parse as written.
text = item "\n";
}
// I want to save the text to a file(collection.txt)
}
});
manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
CodePudding user response:
Based on the manifest.json you posted, it looks like you are running your code as a background script.
Background scripts don't have direct access to the loaded page content - this is why the body is empty in your code.
Instead, you will need to use a content script to access the page content and then message that data across to your background script for processing.
Here is an example setup using a background and content script that should let you retrieve and process the page content when a tab loads (not tested, but should point you in the right direction).
Thanks to the ResourceOverride extension which I used as a reference for writing the example below.
background.js
// background.js
// Routes messages arriving from content scripts to the matching handler.
chrome.runtime.onMessage.addListener(function (msg, origin) {
  // Only object-shaped messages that carry a sender tab are treated as
  // coming from our content script; everything else is dropped.
  var isFromContentScript =
    Boolean(msg) && typeof msg === 'object' && Boolean(origin.tab);
  if (!isFromContentScript) {
    return;
  }

  // Page text collected by content.js.
  if (msg.action === 'receiveBodyText') {
    processBodyText(origin.tab, msg.bodyText);
  }
});
/**
 * Extracts sentence-like fragments from the text of a page.
 *
 * A fragment starts at an uppercase ASCII letter and runs up to the next
 * period on the same line (`.` in the pattern does not cross line breaks).
 *
 * @param {{url: string}} tab - Tab the text came from (the message's `sender.tab`).
 * @param {string} bodyText - Text of the page body sent by the content script.
 * @returns {{url: string, text: string}} The tab URL and the matched
 *   fragments, each followed by "\n" (empty string when nothing matched).
 */
function processBodyText(tab, bodyText){
  const url = tab.url;
  // I want to save the url to a file(url.txt)

  const pattern = /[A-Z].*?\./g;
  // match() returns null (not an empty array) when nothing matches, and
  // bodyText may be missing if the message was malformed.
  const sentences =
    (typeof bodyText === 'string' ? bodyText.match(pattern) : null) || [];
  const text = sentences.map(function(item){ return item + "\n"; }).join("");
  // I want to save the text to a file(collection.txt)

  return { url: url, text: text };
}
content.js
// content.js
// Sends the page's body text to the background script.
function sendBodyText(){
  chrome.runtime.sendMessage({
    action: 'receiveBodyText',
    bodyText: document.body.innerText
  });
}

// Content scripts are injected at "document_idle" by default, which can be
// after the window "load" event has already fired. In that case a "load"
// listener would never run, so send right away when the page is complete.
if (document.readyState === 'complete'){
  sendBodyText();
} else {
  window.addEventListener('load', sendBodyText);
}
manifest.json
// manifest.json
{
"name": "Parser",
"version": "1",
"manifest_version": 2,
"background": {
"scripts":["background.js"]
},
"content_scripts": [{
"matches" : [
"http://*/*",
"https://*/*"
],
"js": ["content.js"]
}],
"permissions": [
"tabs",
"activeTab",
"storage",
"http://*/*",
"https://*/*"
]
}
Information and docs
A note on WebExtension API differences between firefox and chrome:
Chrome uses the chrome namespace; Firefox uses the future-standard browser namespace.
So code written for Chrome would use:
chrome.tabs.onUpdated(...)
and the equivalent in Firefox would be:
browser.tabs.onUpdated(...)
Be aware of that when reading the docs and reading example extensions.
Background scripts
- do not have access to the loaded page
- have full access to the WebExtensions API
Content scripts
- have full access to the loaded page
- have only limited access to the WebExtensions API
- Chrome content scripts docs
- MDN content scripts docs
- Docs on communication between content scripts and background scripts
WebExtensions API
other useful links
- MDN WebExtensions example Github repository
- MDN "Anatomy of a WebExtension"
- MDN detailed browser WebExtensions support tables
- ResourceOverride extension - this is a fairly complex extension that uses both background and content scripts. I used this as a reference/example to better understand how extensions are written.