Home > Blockchain >  Merging and deleting duplicate items within a list of dictionaries
Merging and deleting duplicate items within a list of dictionaries

Time:12-02

I have a list of dictionaries

[{'elementid': 'BsWfsElement.1.1', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86400', 'lat': '32.05570', 'paramname': 'multiplicity', 'paramvalue': '4'}, {'elementid': 'BsWfsElement.1.2', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86400', 'lat': '32.05570', 'paramname': 'peak_current', 'paramvalue': '-11'}, {'elementid': 'BsWfsElement.1.3', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86400', 'lat': '32.05570', 'paramname': 'cloud_indicator', 'paramvalue': '0'}, {'elementid': 'BsWfsElement.1.4', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86400', 'lat': '32.05570', 'paramname': 'ellipse_major', 'paramvalue': '5.8'}, {'elementid': 'BsWfsElement.2.1', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86350', 'lat': '32.02770', 'paramname': 'multiplicity', 'paramvalue': '0'}, {'elementid': 'BsWfsElement.2.2', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86350', 'lat': '32.02770', 'paramname': 'peak_current', 'paramvalue': '-16'}, {'elementid': 'BsWfsElement.2.3', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86350', 'lat': '32.02770', 'paramname': 'cloud_indicator', 'paramvalue': '0'}, {'elementid': 'BsWfsElement.2.4', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86350', 'lat': '32.02770', 'paramname': 'ellipse_major', 'paramvalue': '1.6'}, {'elementid': 'BsWfsElement.3.1', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86730', 'lat': '32.07100', 'paramname': 'multiplicity', 'paramvalue': '0'}, {'elementid': 'BsWfsElement.3.2', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86730', 'lat': '32.07100', 'paramname': 'peak_current', 'paramvalue': '-35'}, {'elementid': 'BsWfsElement.3.3', 'obstime': '2022-07-11T20:00:05', 'lon': '59.86730', 'lat': '32.07100', 'paramname': 'cloud_indicator', 'paramvalue': '0'}, {'elementid': 'BsWfsElement.3.4'

which I want to group by the id subsection of the elementid key (the number right after 'BsWfsElement.'), so that the paramname and paramvalue values from the dictionaries ending in .2, .3 and .4 are appended to the "first" dictionary ending in .1, since every other item in the .2, .3 and .4 dictionaries is a duplicate. Once this is done, I'd remove the elementid item and combine each paramname/paramvalue pair into a single key: value item.

So an example of my desired output in the end would then be

[{'obstime': '2022-07-11T20:00:05', 'lon': '59.86400', 'lat': '32.05570', 'multiplicity': '4', 'peak_current': '-11', 'cloud_indicator': '0', 'ellipse_major': '5.8'} ... ]

My code that creates the list of dictionaries from an XML file

from urllib.request import urlopen
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import fromstring, ElementTree
from itertools import groupby
from operator import itemgetter

# Download the lightning observations from the FMI open-data WFS endpoint.
# The context manager closes the HTTP response once the body has been read.
with urlopen('https://opendata.fmi.fi/wfs?service=WFS&version=2.0.0&request=getFeature&storedquery_id=fmi::observations::lightning::simple&timestep=1&starttime=2022-07-11T20:00:00Z&endtime=2022-07-11T20:05:00Z') as file:
    data = file.read()
tree = ElementTree(fromstring(data))
root = tree.getroot()

# One parallel list per output field; entry i of every list describes the
# i-th row once they are zipped together below.
paramnames = []
paramvalues = []
lon = []
lat = []
obstime = []
ids = []

# Prefix registration only affects how ElementTree serialises these
# namespaces; the findall() calls below use the full namespace URIs.
ET.register_namespace('wfs', "http://www.opengis.net/wfs/2.0")
ET.register_namespace('gml', "http://www.opengis.net/gml/3.2")
ET.register_namespace('BsWfs', "http://xml.fmi.fi/schema/wfs/2.0")
ET.register_namespace('xsi', "http://www.w3.org/2001/XMLSchema-instance")

for pn in root.findall('.//{http://xml.fmi.fi/schema/wfs/2.0}ParameterName'):
    paramnames.append(pn.text)
for pv in root.findall('.//{http://xml.fmi.fi/schema/wfs/2.0}ParameterValue'):
    paramvalues.append(pv.text)
for ps in root.findall('.//{http://www.opengis.net/gml/3.2}pos'):
    # The element text is a whitespace-separated coordinate pair.
    # NOTE(review): as before, the first coordinate is stored as 'lon' and
    # the second as 'lat' -- confirm this matches the axis order of the
    # service's coordinate reference system.
    first, second = ps.text.split()[:2]
    lon.append(first)
    lat.append(second)
for tm in root.findall('.//{http://xml.fmi.fi/schema/wfs/2.0}Time'):
    # Drop the 'Z' designator so the timestamp reads 2022-07-11T20:00:05.
    obstime.append(tm.text.replace('Z', ''))
for i in root.findall('.//{http://xml.fmi.fi/schema/wfs/2.0}BsWfsElement'):
    ids.append(i.get("{http://www.opengis.net/gml/3.2}id"))

# zip() stops at the shortest list, so a field that is missing from some
# elements would silently truncate or misalign the rows.
zippedlist = list(zip(ids, obstime, lon, lat, paramnames, paramvalues))
dictnames = ('elementid', 'obstime', 'lon', 'lat', 'paramname', 'paramvalue')
list_of_dicts = [dict(zip(dictnames, l)) for l in zippedlist]
print(list_of_dicts)

I tried sorting them by the lon item, but found out that it actually doesn't produce the result I wanted

# groupby() only merges *adjacent* rows with equal keys, so the rows are
# sorted by the same key first (a plain string comparison of 'lon').
by_lon = itemgetter('lon')
list_of_dicts = sorted(list_of_dicts, key=by_lon)

# Print every row, one 'lon' group after another, then the sorted list.
for _lon, rows in groupby(list_of_dicts, key=by_lon):
    for row in rows:
        print(row)
print(list_of_dicts)

Output:

{'elementid': 'BsWfsElement.250.1', 'obstime': '2022-07-11T20:02:42', 'lon': '55.16820', 'lat': '30.88440', 'paramname': 'multiplicity', 'paramvalue': '1'}
{'elementid': 'BsWfsElement.250.2', 'obstime': '2022-07-11T20:02:42', 'lon': '55.16820', 'lat': '30.88440', 'paramname': 'peak_current', 'paramvalue': '21'}
{'elementid': 'BsWfsElement.250.3', 'obstime': '2022-07-11T20:02:42', 'lon': '55.16820', 'lat': '30.88440', 'paramname': 'cloud_indicator', 'paramvalue': '0'}
{'elementid': 'BsWfsElement.250.4', 'obstime': '2022-07-11T20:02:42', 'lon': '55.16820', 'lat': '30.88440', 'paramname': 'ellipse_major', 'paramvalue': '2.8'}
{'elementid': 'BsWfsElement.240.1', 'obstime': '2022-07-11T20:02:40', 'lon': '55.67710', 'lat': '31.12120', 'paramname': 'multiplicity', 'paramvalue': '1'}
{'elementid': 'BsWfsElement.240.2', 'obstime': '2022-07-11T20:02:40', 'lon': '55.67710', 'lat': '31.12120', 'paramname': 'peak_current', 'paramvalue': '109'}
{'elementid': 'BsWfsElement.240.3', 'obstime': '2022-07-11T20:02:40', 'lon': '55.67710', 'lat': '31.12120', 'paramname': 'cloud_indicator', 'paramvalue': '0'}
{'elementid': 'BsWfsElement.240.4', 'obstime': '2022-07-11T20:02:40', 'lon': '55.67710', 'lat': '31.12120', 'paramname': 'ellipse_major', 'paramvalue': '1.6'}
...

CodePudding user response:

from collections import defaultdict

# Merge the per-parameter rows of `elements` into one dictionary per
# observation, keyed by the element id without its trailing ".N" part.
# The input dictionaries are only read, never modified, so `elements`
# stays intact and the loop can safely be run again.
combined_elements = defaultdict(dict)
for element in elements:
    # "BsWfsElement.1.3" -> "BsWfsElement.1": drop only the last component.
    elementid = element['elementid'].rsplit('.', 1)[0]
    merged = combined_elements[elementid]
    # Copy the shared fields (everything except the id and the
    # name/value pair) into the merged record.
    for key, value in element.items():
        if key not in ('elementid', 'paramname', 'paramvalue'):
            merged[key] = value
    # Turn the paramname/paramvalue pair into a single key: value item.
    merged[element['paramname']] = element['paramvalue']

# print elements
for elem in combined_elements.values():
    print(elem)

I used your first list as `elements`. `combined_elements` is still keyed by the element ids with the last `.x` part removed (e.g. `BsWfsElement.250`), so you can look the merged dictionaries up by id if you want.

Outputs:

{'obstime': '2022-07-11T20:02:42', 'lon': '55.16820', 'lat': '30.88440', 'multiplicity': '1', 'peak_current': '21', 'cloud_indicator': '0', 'ellipse_major': '2.8'}
{'obstime': '2022-07-11T20:02:40', 'lon': '55.67710', 'lat': '31.12120', 'multiplicity': '1', 'peak_current': '109', 'cloud_indicator': '0', 'ellipse_major': '1.6'}

CodePudding user response:

import re
import json

# Group the rows of `data` by everything except the paramname/paramvalue
# pair, then flatten each group into a single dictionary.

# Matches only the trailing ".N" parameter index, so "BsWfsElement.1.1"
# through "BsWfsElement.1.4" all collapse to "BsWfsElement.1".
trailing_index = re.compile(r'\.[0-9]+$')

tmp = {}
for x in data:
    # Work on a key-sorted copy so the input dictionaries are left untouched;
    # rewriting x['elementid'] in place would strip one more id component
    # every time this loop is re-run on the same data.
    shared = {k: v for k, v in sorted(x.items()) if k not in ['paramname', 'paramvalue']}
    shared['elementid'] = trailing_index.sub('', shared['elementid'])
    # The JSON text of the shared fields is a hashable group key.
    idx = json.dumps(shared)
    tmp.setdefault(idx, []).append({x['paramname']: x['paramvalue']})

# Each result combines a group's shared fields with that group's OWN
# parameters (not those of the first group).
# The name `ouput` (sic) is kept so existing references keep working.
ouput = [
    {**json.loads(idx), **{name: value for pair in pairs for name, value in pair.items()}}
    for idx, pairs in tmp.items()
]

returns:

[{'elementid': 'BsWfsElement',
  'lat': '32.05570',
  'lon': '59.86400',
  'obstime': '2022-07-11T20:00:05',
  'multiplicity': '4',
  'peak_current': '-11',
  'cloud_indicator': '0',
  'ellipse_major': '5.8'},
 {'elementid': 'BsWfsElement',
  'lat': '32.02770',
  'lon': '59.86350',
  'obstime': '2022-07-11T20:00:05',
  'multiplicity': '4',
  'peak_current': '-11',
  'cloud_indicator': '0',
  'ellipse_major': '5.8'},
 {'elementid': 'BsWfsElement',
  'lat': '32.07100',
  'lon': '59.86730',
  'obstime': '2022-07-11T20:00:05',
  'multiplicity': '4',
  'peak_current': '-11',
  'cloud_indicator': '0',
  'ellipse_major': '5.8'}]
  • Related