I am new to geopandas and, to begin with, I am trying to replicate the example from the guide linked below, using some shapefiles of my own, so that I can work with them in PySpark.
https://ncar.github.io/PySpark4Climate/tutorials/pyspark-geo-analysis/geopandas-and-spark/
import os.path, json, io
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = (16, 20)
from retrying import retry # for exponential back down when calling TurboOverdrive API
import pyspark.sql.functions as func # resuse as func.coalace for example
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType,DecimalType
from pyspark.sql import SparkSession
import pandas as pd
from geopandas import GeoDataFrame # Loading boundaries Data
from shapely.geometry import Point, Polygon, shape # creating geospatial data
from shapely import wkb, wkt # creating and parsing geospatial data
from ast import literal_eval as make_tuple # used to decode data from java
# Notebook script: start Spark, load a shapefile into a GeoDataFrame, inspect
# it, and try to add a 'wkt' column holding each geometry as a WKT string.

# Create SparkSession and attach Sparkcontext to it
spark = SparkSession.builder.appName("pyspark-geopandas").getOrCreate()
sc = spark.sparkContext
# Load the boundaries data
geo_df = GeoDataFrame.from_file('it_1km.shp')
# Bare expressions below are inspection steps; they have no effect on geo_df.
geo_df.head()
geo_df.columns
geo_df.plot(column='PUNTI', categorical=True, legend=True)
plt.show()
# map() is lazy in Python 3: the lambda is not called on this line, so the
# missing to_wkt() attribute does not surface here. A map object also cannot
# be indexed, which is why the wkts[0] line below is commented out.
wkts = map(lambda g: g.to_wkt() , geo_df.geometry)
#wkts[0]
type(geo_df.geometry)
# NOTE(review): presumably this only labels the data as EPSG:4326 without
# reprojecting the coordinates -- confirm against the geopandas CRS docs.
geo_df.crs=('epsg:4326')
geo_df.crs
geo_df.geometry.area
# The next five statements repeat the ones above verbatim.
wkts = map(lambda g: g.to_wkt() , geo_df.geometry)
type(geo_df.geometry)
geo_df.crs=('epsg:4326')
geo_df.crs
geo_df.geometry.area
# pd.Series() consumes the map, so the lambda finally runs on each geometry
# here; this is the statement that raises
# AttributeError: 'Polygon' object has no attribute 'to_wkt'.
geo_df['wkt'] = pd.Series(
map(lambda geom: str(geom.to_wkt()), geo_df['geometry']),
index=geo_df.index, dtype='string')
However, the code always crashes with the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-f03fdc1f586f> in <module>
1 geo_df['wkt'] = pd.Series(
2 map(lambda geom: str(geom.to_wkt()), geo_df['geometry']),
----> 3 index=geo_df.index, dtype='string')
/opt/conda/lib/python3.7/site-packages/pandas/core/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
277 data = data.to_dense()
278 else:
--> 279 data = com.maybe_iterable_to_list(data)
280
281 if index is None:
/opt/conda/lib/python3.7/site-packages/pandas/core/common.py in maybe_iterable_to_list(obj)
278 """
279 if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
--> 280 return list(obj)
281 return obj
282
<ipython-input-14-f03fdc1f586f> in <lambda>(geom)
1 geo_df['wkt'] = pd.Series(
----> 2 map(lambda geom: str(geom.to_wkt()), geo_df['geometry']),
3 index=geo_df.index, dtype='string')
AttributeError: 'Polygon' object has no attribute 'to_wkt'
I am working in Jupyter with Python 3.7.6. The same error also occurs with other shapefiles, and with geometry types other than Polygon, such as Point or Polygon Z.
CodePudding user response:
Polygon and Point objects are shapely geometry objects, not geopandas objects, and they have a .wkt attribute, not a .to_wkt() method. See the shapely docs for more info.
So your last block of code should be:
# Read the .wkt attribute of every shapely geometry and store the strings in
# a new column aligned with the frame's index.
wkt_strings = [geom.wkt for geom in geo_df['geometry']]
geo_df['wkt'] = pd.Series(wkt_strings, index=geo_df.index, dtype='string')
However, geopandas.GeoSeries objects do have a vectorized .to_wkt() method, so your code could be rewritten without the element-wise map:
# Call to_wkt() on the geometry column (a GeoSeries) so the result is one WKT
# string per row; GeoDataFrame.to_wkt() would instead return a whole
# DataFrame, which cannot be assigned to a single column.
geo_df['wkt'] = geo_df.geometry.to_wkt()