Home > OS >  JQ: Filtering a massive json with nested arrays efficiently
JQ: Filtering a massive json with nested arrays efficiently

Time:05-01

I have a MASSIVE json file with lots of extraneous fields that I need to trim down to only include certain fields. I've made some headway on processing the file with jq but I've run into some difficulty when it comes to pulling information out of the nested arrays in the json file, and none of the solutions I've found have seemed to work for me.

My json data looks like this:

{
    "results": [
        {
            "url": "https://someresult.com",
            "id": 5192740,
            "external_id": null,
            "via": {
                "channel": "web",
                "source": {
                    "from": {},
                    "to": {},
                    "rel": null
                }
            },
            "created_at": "2022-04-29T15:19:37Z",
            "updated_at": "2022-04-29T15:19:38Z",
            "type": null,
            "subject": "My subject line",
            "raw_subject": "My subject line ",
            "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
            "priority": "normal",
            "status": "new",
            "recipient": "[email protected]",
            "requester_id": 1234567891,
            "submitter_id": 1234567891,
            "assignee_id": null,
            "organization_id": null,
            "group_id": 123456789,
            "collaborator_ids": [],
            "follower_ids": [],
            "email_cc_ids": [],
            "forum_topic_id": null,
            "problem_id": null,
            "has_incidents": false,
            "is_public": true,
            "due_at": null,
            "tags": [
                "_tag_1",
                "_tag_2",
                "_tag_3"
            ],
            "custom_fields": [
                {
                    "id": 1500010396161,
                    "value": null
                },
                {
                    "id": 360009431333,
                    "value": "Keep this data"
                },
                {
                    "id": 360054304553,
                    "value": false
                },
                {
                    "id": 1900000317745,
                    "value": null
                },
                {
                    "id": 360002223154,
                    "value": null
                },
                {
                    "id": 360009431353,
                    "value": "Keep this too"
                },
                {
                    "id": 1500001920482,
                    "value": "Keep this data, as well!"
                }
            ],
            "followup_ids": [],
            "ticket_form_id": 12345678912,
            "brand_id": 112358,
            "allow_channelback": false,
            "allow_attachments": true,
            "result_type": "ticket"
        },
        {
            "url": "https://anotherresult.com",
            "id": 5192741,
            "external_id": null,
            "via": {
                "channel": "web",
                "source": {
                    "from": {},
                    "to": {},
                    "rel": null
                }
            },
            "created_at": "2022-04-18T15:19:37Z",
            "updated_at": "2022-04-18T15:19:38Z",
            "type": null,
            "subject": "My other subject line",
            "raw_subject": "My other subject line ",
            "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
            "priority": "normal",
            "status": "new",
            "recipient": "[email protected]",
            "requester_id": 1234567892,
            "submitter_id": 1234567892,
            "assignee_id": null,
            "organization_id": null,
            "group_id": 123456780,
            "collaborator_ids": [],
            "follower_ids": [],
            "email_cc_ids": [],
            "forum_topic_id": null,
            "problem_id": null,
            "has_incidents": false,
            "is_public": true,
            "due_at": null,
            "tags": [
                "_tag_1",
                "_tag_2",
                "_tag_3"
            ],
            "custom_fields": [
                {
                    "id": 1500010396161,
                    "value": null
                },
                {
                    "id": 360009431333,
                    "value": "Keep this data"
                },
                {
                    "id": 360054304553,
                    "value": false
                },
                {
                    "id": 1900000317745,
                    "value": null
                },
                {
                    "id": 360002223154,
                    "value": null
                },
                {
                    "id": 360009431353,
                    "value": "Keep this too"
                },
                {
                    "id": 1500001920482,
                    "value": "Keep this data, as well!"
                }
            ],
            "followup_ids": [],
            "ticket_form_id": 12345678913,
            "brand_id": 112359,
            "allow_channelback": false,
            "allow_attachments": true,
            "result_type": "ticket"
        }
      ]
    }

And I want to trim it down to this:

{
    "results": [
        {
            "url": "https://someresult.com",
            "created_at": "2022-04-29T15:19:37Z",
            "subject": "My subject line",
            "description": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris et nulla ut sapien ultrices tempus.",
            "recipient": "[email protected]",
            "tags": [
                "_tag_1",
                "_tag_2",
                "_tag_3"
            ],
            "os": "Keep this data",
            "is_signed_in": false,
            "phone_model": "Keep this too",
            "channel": "Keep this data, as well!"
        }
    ]
}

So far, I've been able to brute force it by deleting fields by typing them all out

jq 'del(.results[] | .url, .id, .external_id, .via, .updated_at, .type, .raw_subject, .priority, .status, .requester_id, .submitter_id, .assignee_id, .organization_id, .group_id, .collaborator_ids, .follower_ids, .email_cc_ids, .problem_id, .has_incidents, .is_public, .due_at, .forum_topic_id, .satisfaction_rating, .sharing_agreement_ids, .fields, .followup_ids, .ticket_form_id, .allow_channelback, .allow_attachments, .result_type)'

but that feels absurd (still, it works). But when I try to delete or filter for the fields I want from the custom_fields array, I get stuck (Cannot index array with string "custom fields").

My question is twofold:

  1. Is there a cleaner way I could be selecting which fields to keep rather than specifying which to delete?
  2. How do I grab the fields I need from the nested array, and flattening them to the same level as the rest of the fields, while also renaming them?

CodePudding user response:

The following illustrates one approach to answering the twofold question:

.results |= map(
  (.custom_fields | map(.value | select(. != null))) as $values
  | {url, created_at, subject, description, recipient, tags, 
     os: $values[0],
     is_signed_in: $values[1],
     phone_model: $values[2],
     channel: $values[3] } )

This however results in .results having length 2.

CodePudding user response:

You can use a mapping to keep track of the custom fields wanted :

#!/usr/bin/env bash

jq '{"os"          : 360009431333,
     "is_signed_in": 360054304553,
     "phone_model" : 360009431353,
     "channel"     : 1500001920482} as $mapping |
map_values(map(
   { url,created_at,subject,description,recipient,tags,
   }  
   (.custom_fields as $custom_fields |
    $mapping |
    with_entries(
      .value |= (. as $id | $custom_fields[]|select(.id==$id).value)
    )
   )
))' input.json
  • Related