I've been trying to work this out, but I can only seem to get one URL to be exported to the output file.
The code I am currently using is...
import glob, re

with open('urls.txt', 'a') as output:
    for file in glob.glob('json.txt'):
        with open(file, 'r') as f:
            for line in f.readlines():
                # Scheme, host and path/query are separate capture groups, so
                # findall() returns one (scheme, host, rest) tuple per match.
                pattern = r"(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"
                find = re.findall(pattern, line)
                if find:
                    try:
                        # Only the first match found on this line is written.
                        output.write(str(find[0]))
                    except UnicodeEncodeError:
                        pass
I've tested the regex and it matches all the URLs; the script just won't write them all to the file.
The file I've been trying to extract URLs from contains the following (indented for legibility):
{
"items": [{
"schema": "Event",
"source_id": "99558834",
"event_id": "7103414757044987314",
"start_time": "2022-05-30T06:37:10Z",
"end_time": "2022-05-30T06:37:24Z",
"event_type": "motion",
"source_type": null,
"duration_ms": 14400,
"session_duration": 14000,
"state": "timed_out",
"had_subscription": true,
"is_favorite": false,
"recording_status": "ready",
"cv": {
"person_detected": true,
"stream_broken": false,
"detection_type": "human",
"cv_triggers": null,
"detection_types": [{
"detection_type": "human",
"verified_timestamps": [1653892632153]
}]
},
"properties": {
"is_alexa": false,
"is_sidewalk": false,
"is_autoreply": false
},
"origin": null,
"error_message": null,
"updated_at": "2022-05-30T06:37:28.958Z",
"visualizations": {
"cloud_media_visualization": {
"schema": "CloudMediaVisualization",
"media": [{
"schema": "Media",
"url": "https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/8cbfaccd-9b1a-458b-88b9-5d12976f4293.mp4?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=b42e734ab24ce1c8057038a171c995326de1a8cf219810c33c0d0883e2ea38b2",
"custom_metadata": null,
"is_e2ee": false,
"manifest_id": null,
"file_type": "VIDEO",
"file_family": "VIDEO",
"preroll_duration_ms": 0,
"playback_duration": 14000,
"source": "Apsara"
}, {
"schema": "Media",
"url": "https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/b22b3c85-5de3-4e91-92b5-d91db479df55.mp4?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=7724f211de257f1f13fb585158f0f241e47daa1f1f67a3e48527e45883889a8b",
"custom_metadata": null,
"is_e2ee": false,
"manifest_id": null,
"file_type": "LQ_VIDEO",
"file_family": "LQ_VIDEO",
"preroll_duration_ms": 0,
"playback_duration": 14400,
"source": "Apsara"
}, {
"schema": "Media",
"url": "https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/564fb900-0d78-4521-8a3d-b760fff7ee8d.iframe?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=8ccd9823cd6d2fe0e386b843a700bd05cc3a694c6986a55b75c797cbf846b7c6",
"custom_metadata": null,
"is_e2ee": false,
"manifest_id": null,
"file_type": "THUMBNAIL",
"file_family": "THUMBNAIL",
"preroll_duration_ms": 0,
"playback_duration": 14000,
"source": "Apsara"
}]
},
"local_media_visualization": {
"schema": "LocalMediaVisualization",
"media": []
},
"radar_visualization": null,
"single_coordinate_visualization": null,
"map_visualization": null
},
"device": {
"id": 99558834,
"description": "Front",
"type": "cocoa_camera"
},
"owner_id": "71616327"
}]
}
CodePudding user response:
I think it might be easier to make the data you have valid JSON and then use the object_hook
parameter that json.loads()
supports. For more details see my answer to How to find a particular JSON value by key?.
Here's how to apply it to your data:
import json
def find_values(id, json_repr):
    """Return every value stored under key ``id`` anywhere in a JSON document.

    The document is decoded with ``json.loads`` and an ``object_hook`` that
    inspects each decoded JSON object, so matches at any nesting depth are
    collected without walking the structure by hand.

    Args:
        id: The key to look for in every JSON object.
        json_repr: The JSON document as a string.

    Returns:
        A list of the matching values, in the order the decoder finishes each
        object (a nested object's value comes before its enclosing object's).
        Empty if the key never occurs.

    Raises:
        json.JSONDecodeError: If ``json_repr`` is not valid JSON.
    """
    results = []

    def _decode_dict(a_dict):
        # Invoked by the decoder for every JSON object it decodes.
        try:
            results.append(a_dict[id])
        except KeyError:
            pass
        # Hand the dict back unchanged so decoding proceeds normally.
        return a_dict

    json.loads(json_repr, object_hook=_decode_dict)  # Return value ignored.
    return results
with open('filename.json') as file:
    jstr = file.read()

# NOTE(review): the suffix assumes the file content is missing its final
# "]}"; drop it if the file already holds a complete JSON document, otherwise
# json.loads() will reject the extra brackets.
json_repr = jstr + ']}'  # Make jstr valid JSON.
results = find_values('url', json_repr)

print(f'{len(results)} URLs found')
for i, url in enumerate(results, start=1):
    print(f'{i}: {url}')
Output:
3 URLs found
1: https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/8cbfaccd-9b1a-458b-88b9-5d12976f4293.mp4?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=b42e734ab24ce1c8057038a171c995326de1a8cf219810c33c0d0883e2ea38b2
2: https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/b22b3c85-5de3-4e91-92b5-d91db479df55.mp4?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=7724f211de257f1f13fb585158f0f241e47daa1f1f67a3e48527e45883889a8b
3: https://filestore-086356611853-us-west-2-prod-data.s3.us-west-2.amazonaws.com/564fb900-0d78-4521-8a3d-b760fff7ee8d.iframe?X-Amz-Security-Token=IQoJb3JpZ2luX2VjECcaCXVzLWVhc3QtMSJIMEYCIQCd/iqSm+FneYZ1sRxM1yNyc3Cr8bVV92jQRo6k+4A7pwIhAO4ufSc2Ol8wevIQBjAUZz+7+/ZrSgGpNtDhBH6hWlikKtIECB8QABoMNzM0NDEwMjU5OTMxIgyxJGK4nrZlY0QIGNQqrwTjz9YEN9G7vRk+u9qUDpVIrwzd2jNXuCJ92K+HVCpSQb8wFqg6+h521Ukotxvl9HXThrBDfgK4madk3/J1Gynn3M+Z7MJnpLu0uA9tUperBazYvaNzPgFWBS2kWSUObSO5Jfwn6L9VoB4D//HvOJa5pmDVXFc2s4hSkyxrXfw7W5OoBxdjKPU5TcdamZy7uJgLElZec/7PO99okNwIYQDS0RKKpcdZs3VbBiceXeb8ApDIcDWonMrnmz18Gz9wG+6ERrM6Av31UXID875c6DqfbqxCxpGpVXBlSy6jQENn+l+c5xewwhY4mTq90CcCZXnebCyoqkr2mt0S3lkZSBxdOI8qnoojCmg7yy+FII63h4NKQbEbhm2u1u/b1Ar5UfD4wHzsalhZp83Xej5Lsg0uXvpRCaYoR6mQgvnmVmS1bIFe0StzTHhJHViwEb4XbSK3u5Z/niVcBbVKsidNN9/A33okRPz7FMjpEaOB3lsbeTpmBcC86GlnwFxarYEvWY6eN7uxE0pzuK2asYgat5JqaNj/bRMaW1hi7ivGAj9uFZjMteTdrsNAq6lbLaiL1POhB98D0eJumvA1xu/bxoE7VrW+ikA2LOGwni5EAZ9LIzywxOHx9a5iiC+AFjwUGEzswdmzo0mAq0llNp1twfG5Bn47DHrUfF3NubD3aCA01mQ/SbKKBv+nMD6FK2yo9f8y2Ol/12/RLQMZbkA6i7TpaE7HNvj3ElWgwUp8OddeMPaD1ZQGOqgB4vDMx4xDOedv0RjNjZikdYtR2dHU3V4K9Ls2qUqF6NJ/rbvgwL1s4+m3ZMeOUmLfJDMazkWg8jSNRfKBWFParp2R0/g8TDUEOecwrbmN7cKG3vtnOpZIcFCD46bWvKm9czEun5zbNg6Q1rCLob5RTkEG6H0A729wvomQRldlb6QBtwAC0B7mfnRGgNZrEN3z0SSauZJS3mabSGhxwc0Oem6mFKK6s9Qh&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20220530T225422Z&X-Amz-SignedHeaders=host&X-Amz-Expires=900&X-Amz-Credential=ASIA2V7SDHXNTJG3BKDZ/20220530/us-west-2/s3/aws4_request&X-Amz-Signature=8ccd9823cd6d2fe0e386b843a700bd05cc3a694c6986a55b75c797cbf846b7c6
CodePudding user response:
As mentioned by others, there are better ways to parse/read json, but given your code, it could do what you want with a small tweak.
import glob
import re

# Compiled once: the pattern is identical for every line of every file.
pattern = re.compile(
    r"(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])"
)

with open('urls.txt', 'a') as output:
    for file in glob.glob('json.txt'):
        with open(file, 'r') as f:
            for line in f:
                # A single line may hold several URLs, so visit every match.
                for match in pattern.finditer(line):
                    try:
                        # group(0) is the whole URL. The pattern has capture
                        # groups, so findall() would return
                        # (scheme, host, rest) tuples instead of the URL text.
                        output.write(match.group(0) + "\n")
                    except UnicodeEncodeError:
                        # Skip a URL the output encoding cannot represent,
                        # but keep processing the remaining matches.
                        continue
Your code only writes the first of the matched results (find[0]). You want all of them, so loop through the matches and write each one out.