Python XML Sax Truncates String with no Special Characters-CodePudding

I downloaded some US census area file in KML format. You can download the file here. I am trying to grab the area name and the coordinate boundaries. For some reason, some of the coordinate fields are truncated and not read correctly. For example, the coordinates for "Bloomsburg-Berwick-Sunbury, PA" appears in the KML file as

<coordinates>-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.008418,40.659912,0.0 -76.996995,40.635778,0.0 -76.965528,40.647149,0.0 -76.944828,40.650209,0.0 -76.939883,40.638142,0.0 -76.949148,40.628167,0.0 -76.918672,40.603466,0.0 -76.886411,40.617758,0.0 -76.864254,40.627585,0.0 -76.840104,40.625439,0.0 -76.810269,40.634526,0.0 -76.810044,40.640102,0.0 -76.804867,40.646839,0.0 -76.793851,40.640514,0.0 -76.745894,40.654464,0.0 -76.701624,40.658082,0.0 -76.700546,40.663114,0.0 -76.662137,40.674013,0.0 -76.562175,40.709007,0.0 -76.469523,40.743188,0.0 -76.380334,40.775445,0.0 -76.30717,40.801809,0.0 -76.2991,40.831191,0.0 -76.284611,40.883588,0.0 -76.207827,40.94974,0.0 -76.231194,41.050168,0.0 -76.228975,41.138466,0.0 -76.277639,41.131804,0.0 -76.317953,41.205453,0.0 -76.319957,41.211255,0.0 -76.310261,41.310198,0.0 -76.407934,41.308418,0.0 -76.447597,41.275629,0.0 -76.592607,41.157765,0.0 -76.640767,41.155718,0.0 -76.678776,41.154172,0.0 -76.732672,41.17204,0.0 -76.790807,41.175732,0.0 -76.828168,41.16578,0.0 -76.880963,41.158044,0.0 -76.884245,41.157099,0.0 -76.885228,41.155973,0.0 -76.888145,41.153807,0.0 -76.889338,41.151988,0.0 -76.889669,41.150791,0.0 -76.896114,41.13907,0.0 -76.960229,41.148801,0.0 -76.977939,41.087883,0.0 -77.058088,41.085575,0.0 -77.113839,41.069032,0.0 -77.144111,41.06884,0.0 -77.14416,41.044338,0.0 -77.204027,40.99271,0.0 -77.279236,40.90971,0.0 -77.36418,40.846937,0.0</coordinates>

But is truncated at character 297 out of 1664. This happens seemingly randomly for others as well. Size doesn't seem to be an issue.

['-77.36418,40.846937,0.0 -77.357113,40.844484,0.0 -77.356628,40.807334,0.0 -77.354097,40.701667,0.0 -77.287941,40.693595,0.0 -77.150516,40.677074,0.0 -77.109453,40.691552,0.0 -77.093607,40.676121,0.0 -77.060451,40.679854,0.0 -77.035549,40.676918,0.0 -77.034409,40.659928,0.0 -77.00841']

I tried on two different ec2 machines so I don't think it's a memory/hardware issue. Any idea what is going on?

from xml.sax.handler import ContentHandler
from xml.sax import parse
class KMLHandler(ContentHandler):
    def __init__(self):
        super().__init__()
        self.place_names = []
        self.current_name = None
        self.coordinates = []
        self.temp_coordinates = []
        self.start_placemark = False
        self.capture_place_name = False
        self.capture_cordinates = False
        self.mapping_dict = {}

    def startElement(self, name, attrs):
        if name == 'Placemark':
            self.first_placemark = True
            self.start_placemark = True
            self.temp_coordinates = []
            self.current_name = None
        else:
            pass
        if name == "SimpleData":
            if attrs['name'] == "NAME":
                self.capture_place_name = True
        if name == "coordinates":
            self.capture_cordinates = True

    def endElement(self, name):
        if name == "Placemark":
            self.start_placemark = False
            self.coordinates.append(self.temp_coordinates)
            self.mapping_dict[self.current_name] = self.temp_coordinates

    def characters(self, content):
        if content.strip() != "":
            if self.capture_place_name == True:
                self.place_names.append(content)
                self.current_name = content
                self.capture_place_name = False
            if self.capture_cordinates == True:
                str_vals = [x.split(',')[0:2] for x in content.split(' ')]
                self.temp_coordinates.append(content)
                self.capture_cordinates = False

fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'

handler = KMLHandler()
parse(fname, handler)

CodePudding user response：

As indicated in the comments, each characters event returns a chunk, which may or may not be the entire tag contents. It's similar to reading from a network; you might not get everything at once.

I reworked your code below, and it seems to report the right answer for Berwick. On my machine, the first chunk is 283 characters and the 2nd chunk is 1353 characters. 283 1353 = 1636, which matches the size of the data in the file.

Instead of a set of Booleans, I think it's simpler to capture the tag name, and then test for that when you're processing characters. There's only one controlling value, and it's set & reset in one place.

I didn't see a need for temp_coordinates. It wasn't clear to me whether you want coordinates to be a list or what, exactly, so I just grab the string.

from xml.sax import parse
class KMLHandler(ContentHandler):
    def __init__(self):
        super().__init__()
        self.place_names = []
        self.current_name = None
        self.coordinates = []
        self.start_placemark = False
        self.capture_place_name = False
        self.mapping_dict = {}
        self.capture = ''

    def startElement(self, name, attrs):
        self.capture = ''
        if name == 'Placemark':
            self.first_placemark = True
            self.start_placemark = True
            self.current_name = None
        else:
            pass
        if name == "SimpleData":
            if attrs['name'] == "NAME":
                self.capture = name
        if name == "coordinates":
            self.capture = name

    def endElement(self, name):
        if name == "Placemark":
            self.start_placemark = False
            self.mapping_dict[self.current_name] = self.coordinates
            self.coordinates = []

    def characters(self, content):
        if content.strip() != "":
            if self.capture == 'SimpleData':
                self.place_names.append(content)
                self.current_name = content
                self.capture_place_name = False
            if self.capture == "coordinates":
                self.coordinates.append(content)
                print( '%d coordinates for %s: {%s}' % (len(content),
                                                        self.current_name,
                                                        self.coordinates) )

fname='./cb_2020_us_csa_5m.kml'
# fname='./test_small2.kml'

handler = KMLHandler()
parse(fname, handler)