I have a lot of JavaScript code with differing structure, but every script contains multiple JSON-like objects with a similar structure.
I want to extract only those objects that contain the key/value pair "@context": "https://schema.org".
I have written the pattern below, but it matches the JSON object together with some of the surrounding JavaScript code. I only want to grab the JSON object itself, nothing else. Can anyone help me?
(\{\s\S\@context\"\:\s\"https\:\//schema\.org\"\,[\s\S] )
Here is some example code:
);
jQuery(function ($) {
$('.cst_CERTIFIED_DEALER .cstBtn').append('<img src="//pictures.dealer.com/p/pellegrinoautosalesllc/0869/88d3d9a6608ebd537372b5db5fdde7b1x.jpg" alt="" />').css({'background':'none'});
});
jQuery(function ($) {
var $ddcValueStatementHeader = $('.value-statement-header [data-widget-id="template-header1"]');
if($ddcValueStatementHeader.length) {
$ddcValueStatementHeader.append($(".cst_CARFAX")).find(".cstBtn").css('z-index','950');
} else {
$('.header-default[data-widget-id="template-header1"]').append($(".cst_CARFAX")).find(".cstBtn").css('z-index','950');
}
$(".cst_CARFAX .cstBtn").css('cursor','auto');
});
jQuery(function ($) {
$('.cst_CARFAX .cstBtn').append('<img src="/sites/p/pellegrinoautosalesllc/images/carfax-logo.png" alt="CARFAX" />').css({'background':'none'});
});
jQuery(function ($) {
var $ddcValueStatementHeader = $('.value-statement-header [data-widget-id="template-header1"]');
if($ddcValueStatementHeader.length) {
$ddcValueStatementHeader.append($(".cst_EDMUNDS_AWARD")).find(".cstBtn").css('z-index','950');
} else {
$('.header-default[data-widget-id="template-header1"]').append($(".cst_EDMUNDS_AWARD")).find(".cstBtn").css('z-index','950');
}
$(".cst_EDMUNDS_AWARD .cstBtn").css('cursor','auto');
});
jQuery(function ($) {
$('.cst_EDMUNDS_AWARD .cstBtn').append('<img src="//pictures.dealer.com/p/pellegrinoautosalesllc/0002/6f1c3979ac0afac9b8d12ffb90e5af73x.jpg" alt="" />').css({'background':'none'});
});
var el = document.createElement('script');
el.type = 'application/ld json';
el.id = 'ddc-schemaorg-integration';
tpsSchemaJson = ( typeof tpsSchemaJson != 'undefined' && tpsSchemaJson instanceof Array ) ? tpsSchemaJson : [];
// Car specific data
var vehicleSchema = {
"@context": "https://schema.org",
"@type": "Car",
"description": document.head.querySelector("[name=description]") ? document.head.querySelector("[name=description]").content : "",
"vehicleModelDate": "2015",
"manufacturer": "Ram",
"model": "1500",
"sku": "08c765ea0a0e0a922cefdf66496c54cd",
"bodyType": "Truck Crew Cab",
"itemCondition": "used",
"url": location.origin location.pathname,
"vehicleIdentificationNumber": "3C6RR7LT1FG710130",
"fuelEfficiency": ["16","23"],
"driveWheelConfiguration":"4x4",
"vehicleEngine": "V-8 cyl",
"color": "Bright White",
"vehicleInteriorColor": "Diesel Gray/Black",
"fuelType": "Regular Unleaded",
"mileageFromOdometer": "60455",
"vehicleTransmission": "8 speed automatic",
"name": "Ram 1500 Truck Crew Cab",
"image": "https://images.dealer.com/autodata/us/large_stockphoto-color/2015/USC50RMT11CB0/PW7.jpg",
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "31000.0",
"availability": "http://schema.org/InStock"
}
};
tpsSchemaJson.push(vehicleSchema);
var hours = "[09:00 to 7:00pm-${pmtime}=Tu Mo Th, 09:00 to 4:00pm-${pmtime}=Sa, 09:00 to 5:00pm-${pmtime}=Fr We]".trim().replace(/]/g," ").replace(/[\[\]\[]/g,"").replace(/\=/g, " ").replace(/, /g, ",").trim().split(",");
var social = [];
var social = "https://www.facebook.com/PellegrinoAuto,https://www.youtube.com/channel/UCsVaRr3q6TVBeiIfgByQo7g".replace(/"/g, "").trim().split(",");
tpsSchemaJson = ( typeof tpsSchemaJson != 'undefined' && tpsSchemaJson instanceof Array ) ? tpsSchemaJson : [];
var autodealer = {
"@context" : "http://schema.org",
"@type" : "AutoDealer",
"openingHours" : hours,
"name" : "Pellegrino Auto Sales",
"url" : location.origin,
"address": {
"@type": "PostalAddress",
"addressLocality": "Batavia",
"addressRegion": "NY",
"postalCode": "14020",
"streetAddress": "4060 Pearl St Rd"
},
"image": "https://pictures.dealer.com/p/pellegrinoautosalesllc/1186/6c3181b62e95f47569cab0f5772980ddx.jpg",
"hasMap": "https://www.google.com/maps/place/Pellegrino Auto Sales/@42.9944888,-78.2148906,17z/data=!3m1!4b1!4m5!3m4!1s0x89d3edd202106ad7:0xf37ec17084302960!8m2!3d42.9944888!4d-78.2126966",
"description": "Used car dealership in Batavia, NY carries a wide variety of quality and affordable pre-owned vehicles from top makers like Chevrolet, Ford, Nissan, Toyota and more. Apply online for car loans or browse inventory now!",
"logo": "https://pictures.dealer.com/p/pellegrinoautosalesllc/1627/907d4e642e3374952183d6026dc0d492x.jpg",
"sameAs" : social,
"geo" : {
"@type" : "GeoCoordinates",
"latitude" : "42.994680",
"longitude" : "-78.212698"
},
"contactPoint": {
"@type": "ContactPoint",
"contactType": "Customer Service",
"telephone": " 15853442658"
},
"telephone": " 15853442658",
"priceRange": "Call for quote",
"areaServed": ["Batavia", " Le Roy", " Medina NY", " Buffalo", " Rochester", " NY"]
};
tpsSchemaJson.push(autodealer);
el.text = JSON.stringify(tpsSchemaJson);
console.log("DDC Schema.org code loaded.")
jQuery(function($) {
$('body').append(el);
});
$('[data-widget-id="template-header1"]').append($('.socialheader-header-container').removeClass('hidden').removeClass('hide'));
window.DDC = window.DDC || {};
DDC.dataLayer = (DDC.dataLayer || {});
DDC.dataLayer.site = (DDC.dataLayer.site || {});
DDC.dataLayer.site.siteInfo = (DDC.dataLayer.site.siteInfo || {});
DDC.dataLayer.site.siteInfo.vinLensAccountId = 19634;
window.DDC = window.DDC || {};
var trackerNames = [];
trackerNames.push('UA1436281301');
ga('create', {trackingId: 'UA-143628130-1', cookieDomain: 'auto', name: 'UA1436281301'});
ga(function() {
for (var i=0; i < trackerNames.length; i) {
var name = trackerNames[i];
ga(name '.send', 'pageview');
}
});
if( (Math.random() * 100) < 5 ) {
DDC.getScripts({ js: ['/v9/media/js/web-vitals-tracking/google-analytics/index.js'] });
}
if (jQuery.cookie !== undefined && (!jQuery.cookie('ddc_abc_cache') || jQuery.cookie('ddc_abc_cache') === '[object Object]' || !jQuery.cookie('ddc_abcamm_cache'))) {
$(function() {
jQuery.ajax({
url: "//pixall.esm1.net/cookie",
xhrFields: {
withCredentials: true
},
success: function (data) {
var eoCookieExists = !!jQuery.cookie('ddc_abc_cache');
var adCookies = ['abc', 'abcc', 'abcamm', 'abcg'];
for (var i = adCookies.length - 1; i >= 0; i--) {
var cookie = adCookies[i];
if (typeof data[cookie] !== undefined) {
var expirationDays = (data[cookie] === "") ? 1 : 7;
jQuery.cookie('ddc_' cookie '_cache', data[cookie], { expires: expirationDays, path: '/' });
}
}
if (data['abc']) {
sessionStorage.setItem("pixallCookieIsSet", true);
}
if (!eoCookieExists && jQuery.cookie('ddc_abc_cache')) {
window.DDC = window.DDC || {};
window.DDC.tracking = window.DDC.tracking || {};
window.DDC.tracking.ddc_abc_cache = data['abc'];
jQuery.publish('ddc-eo-cookies-set');
}
}
});
});
}
window.DDC = window.DDC || {};
window.DDC.i18n = window.DDC.i18n || {};
window.DDC.i18n.labels = Object.assign(window.DDC.i18n.labels || {}, {
'NO': 'No',
'OOPS_EXCLAMATION_YOU_MISSED_THIS_ONE': 'Oops\x21\x20You\x20missed\x20this\x20one.',
'PLEASE_CORRECT_THIS_VALUE': 'Please\x20correct\x20this\x20value.',
'PLEASE_ENTER_A_NUMERIC_VALUE': 'Please\x20enter\x20a\x20numeric\x20value.',
'PLEASE_ENTER_A_VALID_EMAIL_ADDRESS': 'Please\x20enter\x20a\x20valid\x20email\x20address.',
'PLEASE_ENTER_A_VALID_URL': 'Please\x20enter\x20a\x20valid\x20URL.',
'PLEASE_ENTER_A_VALUE_LARGER_THAN_ONE_DOLLAR': 'Please\x20enter\x20a\x20value\x20larger\x20than\x20\x241.',
'PLEASE_ENTER_A_VALUE_SMALLER_THAN_ONE_DOLLAR': 'Please\x20enter\x20a\x20value\x20smaller\x20than\x20\x241.',
'YOU_HAVE_XX_INVALID_ENTRIES_IN_THE_FORM': 'You\x20have\x20XX\x20invalid\x20entries\x20in\x20the\x20form',
'TCPA_CONSENT_ERROR_VERBIAGE': 'Your\x20consent\x20is\x20required\x20to\x20complete\x20this\x20action.\x20If\x20you\x20choose\x20to\x20not\x20opt\x2Din,\x20please\x20select\x20a\x20different\x20contact\x20method.\x20',
'VIDEOPLAYER_CAPTIONS_TITLE': 'Captions',
'VIDEOPLAYER_CAPTIONS_ON': 'On',
'VIDEOPLAYER_CAPTIONS_OFF': 'Off',
'VIDEOPLAYER_FULLSCREEN': 'Fullscreen',
'VIDEOPLAYER_QUALITY_TITLE': 'Quality',
'VIDEOPLAYER_QUALITY_HIGH': 'High',
'VIDEOPLAYER_QUALITY_LOW': 'Low',
'INVALID_DATE': 'Invalid\x20date'
});
};
jQuery(scripts);
/*]]>*/
</script>
<div data-location="page-fo
It should match every object like this one:
{
"@context": "https://schema.org",
"@type": "Car",
"description": document.head.querySelector("[name=description]") ? document.head.querySelector("[name=description]").content : "",
"vehicleModelDate": "2015",
"manufacturer": "Ram",
"model": "1500",
"sku": "08c765ea0a0e0a922cefdf66496c54cd",
"bodyType": "Truck Crew Cab",
"itemCondition": "used",
"url": location.origin location.pathname,
"vehicleIdentificationNumber": "3C6RR7LT1FG710130",
"fuelEfficiency": ["16","23"],
"driveWheelConfiguration":"4x4",
"vehicleEngine": "V-8 cyl",
"color": "Bright White",
"vehicleInteriorColor": "Diesel Gray/Black",
"fuelType": "Regular Unleaded",
"mileageFromOdometer": "60455",
"vehicleTransmission": "8 speed automatic",
"name": "Ram 1500 Truck Crew Cab",
"image": "https://images.dealer.com/autodata/us/large_stockphoto-color/2015/USC50RMT11CB0/PW7.jpg",
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "31000.0",
"availability": "http://schema.org/InStock"
}
};
There will be multiple objects containing the same string, and I want to match all of them. Can anyone help?
CodePudding user response:
As with most regex problems, you'll have to make some (albeit educated) guesses. Assuming:
- the JSON object ends with a semicolon
- there aren't any dangling semicolons in the middle of the object (which should almost never happen)
Then this should work:
r".*(\{.*?[\'\"]@context[\'\"]\s*:\s*[\'\"]https://schema\.org[\'\"].*?\}(?=[\n\r\s]*;\s*[\n\r])).*"
Used as follows:
# src = the entire input (the full script/page text to search, as one string)
# Assumes `re` has already been imported and `src` has been assigned — neither is shown here.
# NOTE(review): the pattern is wrapped in a greedy `.*` on both sides and is run with
# re.DOTALL, so a single match spans the whole input. findall therefore appears to yield
# only one captured object (the last qualifying one) — confirm on inputs that contain
# several "https://schema.org" objects.
regex = r".*(\{.*?[\'\"]@context[\'\"]\s*:\s*[\'\"]https://schema\.org[\'\"].*?\}(?=[\n\r\s]*;\s*[\n\r])).*"
# findall returns the list of group-1 captures; [0] takes the first entry and raises
# IndexError when nothing matched.
print(re.findall(regex, src, flags=re.DOTALL)[0])
Which gave me:
{
"@context": "https://schema.org",
"@type": "Car",
"description": document.head.querySelector("[name=description]") ? document.head.querySelector("[name=description]").content : "",
"vehicleModelDate": "2015",
"manufacturer": "Ram",
"model": "1500",
"sku": "08c765ea0a0e0a922cefdf66496c54cd",
"bodyType": "Truck Crew Cab",
"itemCondition": "used",
"url": location.origin location.pathname,
"vehicleIdentificationNumber": "3C6RR7LT1FG710130",
"fuelEfficiency": ["16","23"],
"driveWheelConfiguration":"4x4",
"vehicleEngine": "V-8 cyl",
"color": "Bright White",
"vehicleInteriorColor": "Diesel Gray/Black",
"fuelType": "Regular Unleaded",
"mileageFromOdometer": "60455",
"vehicleTransmission": "8 speed automatic",
"name": "Ram 1500 Truck Crew Cab",
"image": "https://images.dealer.com/autodata/us/large_stockphoto-color/2015/USC50RMT11CB0/PW7.jpg",
"offers": {
"@type": "Offer",
"priceCurrency": "USD",
"price": "31000.0",
"availability": "http://schema.org/InStock"
}
}
Explanation
Since it'll be too complicated and practically unhelpful to explain this token by token, I'll explain the important pieces here:
[\'\"]
takes care of any type of quotation used
(?=[\n\r\s]*;\s*[\n\r])
ensures that the object is followed by some number of newlines or spaces, then a semicolon, and then a newline after it (basically, it should be the end of the JS expression)
re.DOTALL
ensures that the dot operator matches everything, including newlines