# Source code for swisslandstats.dataframe

from __future__ import division

import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import from_origin

from . import geometry as sls_geometry
from . import plotting, settings

__all__ = ['LandDataFrame', 'merge', 'read_csv']

# Shared docstring template for `LandDataFrame.merge` and the module-level
# `merge`. The single `%s` placeholder receives the extra parameter entries
# (the `left` entry for the module-level function, nothing for the method).
_merge_doc = """
Merges LandDataFrame objects
The default parameter values will do an outer join using the indices of both
dataframes as join keys, and will avoid duplicating columns.
See also the documentation for `pandas.merge`

Parameters
----------%s
right : LandDataFrame
    the dataframe to merge with
duplicate_columns : boolean, default False
    whether the columns of `right` whose labels are already present in the
    left dataframe are kept in the merge. If False, such columns are dropped
    from `right` before merging
how : {'left', 'right', 'outer', 'inner'}, default 'outer'
    parameter passed to `pandas.merge`
left_index : boolean, default True
    parameter passed to `pandas.merge`
right_index : boolean, default True
    parameter passed to `pandas.merge`
**kwargs : additional keyword arguments passed to `pandas.merge`

Returns
-------
result : LandDataFrame
"""


class LandDataFrame(pd.DataFrame):
    """
    A LandDataFrame object is a pandas.DataFrame extended to deal with the land
    statistics files provided by the Swiss Federal Statistical Office (SFSO).
    Each row of a LandDataFrame represents a pixel of a raster landscape, with
    the 'x' and 'y' that depict the centroid of the pixel, as well as a set of
    land use/land cover (LULC) information columns.

    The default parameters are defined to work with SFSO data out-of-the-box,
    but they can be modified through the following keyword arguments:

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict or DataFrame
        Data that will be passed to the initialization method of `pd.DataFrame`
    index_column : str, optional
        Label of the index column. If `None` is provided, the value set in
        `settings.DEFAULT_INDEX_COLUMN` will be taken.
    x_column : str, optional
        Label of the x-coordinates column. If `None` is provided, the value
        set in `settings.DEFAULT_X_COLUMN` will be taken.
    y_column : str, optional
        Label of the y-coordinates column. If `None` is provided, the value
        set in `settings.DEFAULT_Y_COLUMN` will be taken.
    crs : rasterio CRS, optional
        Coordinate reference system, as a rasterio CRS object. If `None` is
        provided, the value set in `settings.DEFAULT_CRS` will be taken.
    res : tuple, optional
        The (x, y) resolution of the dataset. If `None` is provided, the value
        set in `settings.DEFAULT_RES` will be taken.
    **df_init_kws : optional
        Additional keyword arguments passed to the initialization method of
        `pd.DataFrame`
    """

    # custom attributes declared to pandas, so that they can be set on the
    # instance as plain attributes (instead of being treated as columns)
    _metadata = ['x_column', 'y_column', 'crs', 'res']

    def __init__(self, data, index_column=None, x_column=None, y_column=None,
                 crs=None, res=None, **df_init_kws):
        # init the pandas dataframe
        super(LandDataFrame, self).__init__(data, **df_init_kws)

        # set the index (unless the data is already indexed by that column)
        if index_column is None:
            index_column = settings.DEFAULT_INDEX_COLUMN
        if self.index.name != index_column:
            self.set_index(index_column, inplace=True)

        # set the rest of attributes, falling back to the package settings
        if x_column is None:
            x_column = settings.DEFAULT_X_COLUMN
        if y_column is None:
            y_column = settings.DEFAULT_Y_COLUMN
        if crs is None:
            crs = settings.DEFAULT_CRS
        if res is None:
            res = settings.DEFAULT_RES

        self.x_column = x_column
        self.y_column = y_column
        self.crs = crs
        self.res = res

    def get_transform(self):
        """
        Build the georeferencing transform of the raster landscape

        Returns
        -------
        transform
            The object returned by `rasterio.transform.from_origin` for the
            upper-left corner of the landscape and the (x, y) resolution
        """
        x = self[self.x_column].values
        y = self[self.y_column].values

        xres, yres = self.res

        # the coordinates depict pixel centroids, so the outer corner lies
        # exactly half a pixel away. True division is required here: floor
        # division would misplace the origin for odd or fractional resolutions
        x_origin = x.min() - xres / 2
        y_origin = y.max() + yres / 2
        return from_origin(x_origin, y_origin, xres, yres)

    def to_ndarray(self, column, nodata=0, dtype=np.uint8):
        """
        Convert a LULC column to a numpy array

        Parameters
        ----------
        column : str
            name of the LULC column
        nodata : numeric
            value to be assigned to pixels with no data
        dtype : str or numpy dtype
            the data type

        Returns
        -------
        lulc_arr : np.ndarray
            A LULC array, whose first row corresponds to the largest
            y-coordinate and whose first column corresponds to the smallest
            x-coordinate
        """
        x = self[self.x_column].values
        y = self[self.y_column].values
        z = self[column].values

        xres, yres = self.res
        # pixel offsets counted from the lower-left pixel; cast to int so that
        # float coordinates still yield a valid array shape and valid indices
        i = ((y - y.min()) // yres).astype(int)
        j = ((x - x.min()) // xres).astype(int)

        num_rows = i.max() + 1
        num_cols = j.max() + 1

        # NaN marks the pixels that no row of the dataframe fills in
        lulc_arr = np.full((num_rows, num_cols), np.nan)
        # rows are flipped so that the largest y ends up in the first row.
        # Indexing with `-i` does not achieve that: `-0` is still row 0, which
        # rolls the image by one row instead of mirroring it
        lulc_arr[num_rows - 1 - i, j] = z
        lulc_arr[np.isnan(lulc_arr)] = nodata

        return lulc_arr.astype(dtype)

    def to_geotiff(self, fp, column, nodata=0, dtype=rasterio.uint8):
        """
        Export a LULC column to a GeoTIFF file

        Parameters
        ----------
        fp : str, file object or pathlib.Path object
            A filename or URL, a file object or a Path object, passed as is
            to `rasterio.open` in write mode
        column : str
            name of the LULC column
        nodata : numeric
            value to be assigned to pixels with no data (also recorded as the
            nodata value of the output raster)
        dtype : str or numpy dtype
            the data type
        """
        lulc_arr = self.to_ndarray(column, nodata, dtype)

        # take the dtype name from the converted array: `str(dtype)` only
        # gives a usable name when `dtype` is already a string
        with rasterio.open(fp, 'w', driver='GTiff', height=lulc_arr.shape[0],
                           width=lulc_arr.shape[1], count=1,
                           dtype=lulc_arr.dtype.name, nodata=nodata,
                           crs=self.crs,
                           transform=self.get_transform()) as raster:
            raster.write(lulc_arr, 1)

    def plot(self, column, cmap=None, legend=False, figsize=None, ax=None,
             **show_kws):
        # the docstring is attached right after the definition, from the
        # template shared with `plotting.plot_ndarray`
        # TODO: automatically assign cmaps according to columns
        lulc_arr = self.to_ndarray(column)
        return plotting.plot_ndarray(lulc_arr, transform=self.get_transform(),
                                     cmap=cmap, legend=legend, figsize=figsize,
                                     ax=ax, **show_kws)

    plot.__doc__ = plotting._plot_ndarray_doc % (
        'column', '\ncolumn : str\n    data column to display')

    def clip_by_geometry(self, geometry, geometry_crs=None):
        # thin wrapper; the docstring comes from the `geometry` module
        return sls_geometry.clip_by_geometry(self, geometry,
                                             geometry_crs=geometry_crs)

    clip_by_geometry.__doc__ = sls_geometry._clip_by_geometry_doc % ''

    def clip_by_nominatim(self, query, which_result=1):
        # thin wrapper; the docstring comes from the `geometry` module
        return sls_geometry.clip_by_nominatim(self, query,
                                              which_result=which_result)

    clip_by_nominatim.__doc__ = sls_geometry._clip_by_nominatim_doc % ''

    # pandas methods
    def __getitem__(self, key):
        """
        Index like `pd.DataFrame`, keeping the LandDataFrame type (and its
        attributes) only when the result still has both coordinate columns
        """
        result = super(LandDataFrame, self).__getitem__(key)
        if isinstance(result, pd.DataFrame):
            # TODO: check that there is at least one column of land statistics
            if self.x_column in result and self.y_column in result:
                result.__class__ = LandDataFrame
                # carry over every custom attribute, so that non-default
                # coordinate column labels are not lost on the selection
                result.x_column = self.x_column
                result.y_column = self.y_column
                result.crs = self.crs
                result.res = self.res
            else:
                # without coordinates the result is no longer a landscape
                result.__class__ = pd.DataFrame
        return result

    def merge(self, right, duplicate_columns=False, how='outer',
              left_index=True, right_index=True, **kwargs):
        # forward every argument explicitly; otherwise the values given by
        # the caller would be dropped and the defaults of the module-level
        # `merge` would silently apply
        return merge(self, right, duplicate_columns=duplicate_columns,
                     how=how, left_index=left_index, right_index=right_index,
                     **kwargs)

    merge.__doc__ = _merge_doc % ''

    @property
    def _constructor(self):
        # makes pandas build LandDataFrame objects in its own operations
        return LandDataFrame

    # geopandas
    def get_geoseries(self):
        # thin wrapper; the docstring comes from the `geometry` module
        return sls_geometry.get_geoseries(self)

    get_geoseries.__doc__ = sls_geometry._get_geoseries_doc % ''

    def to_geodataframe(self, drop_xy_columns=True):
        # thin wrapper; the docstring comes from the `geometry` module
        return sls_geometry.to_geodataframe(self,
                                            drop_xy_columns=drop_xy_columns)

    to_geodataframe.__doc__ = sls_geometry._to_geodataframe_doc % ''


def merge(left, right, duplicate_columns=False, how='outer', left_index=True,
          right_index=True, **kwargs):
    # NOTE: the docstring of this function is assigned right after its
    # definition, from the `_merge_doc` template shared with
    # `LandDataFrame.merge`

    if duplicate_columns:
        _right = right
    else:
        # keep only the columns of `right` whose labels are not in `left`, so
        # that the merge does not yield suffixed duplicates of shared columns
        _right = right[right.columns.difference(left.columns)]

    return pd.merge(left, _right, how=how, left_index=left_index,
                    right_index=right_index, **kwargs)


# Fill the shared docstring template with the `left` parameter entry, which is
# inserted right below the "Parameters" header of the template.
merge.__doc__ = _merge_doc % '\nleft : LandDataFrame'


def read_csv(filepath_or_buffer, index_column=None, x_column=None,
             y_column=None, crs=None, res=None, read_csv_kws=None,
             df_init_kws=None):
    """
    Read a CSV file into a LandDataFrame. See also the documentation for
    `pandas.read_csv`.

    Parameters
    ----------
    filepath_or_buffer : str, pathlib.Path, py._path.local.LocalPath or any
    object with a read() method (such as a file handle or StringIO)
        The string could be a URL. Valid URL schemes include http, ftp, s3, and
        file. For file URLs, a host is expected. For instance, a local file
        could be file://localhost/path/to/table.csv
    index_column : str, optional
        Label of the index column. If `None` is provided, the value set in
        `settings.DEFAULT_INDEX_COLUMN` will be taken.
    x_column : str, optional
        Label of the x-coordinates column. If `None` is provided, the value
        set in `settings.DEFAULT_X_COLUMN` will be taken.
    y_column : str, optional
        Label of the y-coordinates column. If `None` is provided, the value
        set in `settings.DEFAULT_Y_COLUMN` will be taken.
    crs : rasterio CRS, optional
        Coordinate reference system, as a rasterio CRS object. If `None` is
        provided, the value set in `settings.DEFAULT_CRS` will be taken.
    res : tuple, optional
        The (x, y) resolution of the dataset. If `None` is provided, the value
        set in `settings.DEFAULT_RES` will be taken.
    read_csv_kws : dict-like, optional
        Keyword arguments to be passed to `pandas.read_csv`
    df_init_kws : dict-like, optional
        Keyword arguments to be passed to the initialization method of
        `LandDataFrame`, in addition to the ones listed above

    Returns
    -------
    result : LandDataFrame
    """
    # `None` stands for "no extra keyword arguments"
    if read_csv_kws is None:
        read_csv_kws = {}
    if df_init_kws is None:
        df_init_kws = {}

    df = pd.read_csv(filepath_or_buffer, **read_csv_kws)

    return LandDataFrame(df, crs=crs, res=res, index_column=index_column,
                         x_column=x_column, y_column=y_column, **df_init_kws)