Source code for rastertodataframe.rastertodataframe

# -*- coding: utf-8 -*-
import os
import logging
import tempfile
import uuid
import shutil

import numpy as np
import pandas as pd

from rastertodataframe import util, tiling

log = logging.getLogger(__name__)

def raster_to_dataframe(raster_path, vector_path=None):
    """Convert a raster to a Pandas DataFrame.

    Each row of the result is one pixel and each raster band is one column.
    When a vector is given, only the pixels selected by the burned vector
    mask are returned and each pixel row is left-joined with the attributes
    of the vector feature it belongs to.

    Parameters
    ----------
    raster_path : str
        Path to raster file.
    vector_path : str
        Optional path to vector file. If given, raster pixels will be
        extracted from features in the vector. If None, all raster pixels
        are converted to a DataFrame.

    Returns
    -------
    pandas.core.frame.DataFrame
        Pixel values with the helper ``__fid__`` and ``geometry`` columns
        dropped.

    Notes
    -----
    Intermediate files for the vector mask are written to a temporary
    directory that is removed before returning, including when extraction
    raises an exception.
    """
    # Get raster band names.
    ras = util.open_raster(raster_path)
    raster_band_names = util.get_raster_band_names(ras)

    if vector_path is None:
        # No vector given, simply load the raster.
        out_df = _raster_pixels_to_dataframe(ras, raster_band_names)
    else:
        temp_dir = tempfile.mkdtemp()
        try:
            out_df = _vector_pixels_to_dataframe(
                ras, raster_path, vector_path, raster_band_names, temp_dir)
        finally:
            # Always clean up, even if extraction failed part-way; removal
            # itself stays best-effort as before.
            shutil.rmtree(temp_dir, ignore_errors=True)

    # TODO mask no data values.

    # Return dropping any extra cols.
    return out_df.drop(columns=['__fid__', 'geometry'], errors='ignore')


def _raster_pixels_to_dataframe(ras, band_names):
    """Convert every pixel of ``ras`` to DataFrame rows, tile by tile."""
    tile_dfs = []  # DataFrames of each tile.
    for ras_arr in tiling.tiles(ras):
        # A 2-D tile is treated as a single band (rows, cols); otherwise
        # axes 1 and 2 are taken as the pixel grid of a multiband tile.
        row_axis, col_axis = (0, 1) if ras_arr.ndim == 2 else (1, 2)

        # An all-ones mask selects every pixel of the tile.
        mask_arr = np.ones(
            (ras_arr.shape[row_axis], ras_arr.shape[col_axis]))
        pixels = util.get_pixels(ras_arr, mask_arr).transpose()
        tile_dfs.append(pd.DataFrame(pixels, columns=band_names))

    # Merge all the tiles.
    return pd.concat(tile_dfs)


def _vector_pixels_to_dataframe(ras, raster_path, vector_path, band_names,
                                temp_dir):
    """Extract pixels selected by the vector mask, joined to its attributes.

    Intermediate files are written inside ``temp_dir``; the caller owns that
    directory and is responsible for removing it.
    """
    vec_with_fid = os.path.join(temp_dir, str(uuid.uuid1()))

    # Add a dummy feature ID column to the vector.
    # This is not always present in OGR features.
    vec_gdf = util.open_vector(vector_path, with_geopandas=True)
    mask_values = list(range(1, len(vec_gdf) + 1))
    # Assign the list positionally. Wrapping it in a pd.Series would align on
    # index labels and yield NaN/misplaced IDs whenever the frame's index is
    # not exactly 0..n-1.
    vec_gdf['__fid__'] = mask_values
    vec_gdf.to_file(vec_with_fid, driver='GeoJSON')

    # Mask the vector using the feature ID column.
    vector_mask_fname = os.path.join(temp_dir, str(uuid.uuid1()))
    vector_mask = util.burn_vector_mask_into_raster(
        raster_path, vec_with_fid, vector_mask_fname,
        vector_field='__fid__')

    # NOTE(review): the mask is read once for band 1 as a whole array while
    # ras_arr below is whatever tiling.tiles() yields; this relies on
    # util.get_pixels accepting that pairing -- confirm against tiling/util.
    mask_arr = vector_mask.GetRasterBand(1).ReadAsArray()

    tile_dfs = []  # DataFrames of each tile.
    for ras_arr in tiling.tiles(ras):
        mask_dfs = []  # DataFrames of each mask value.
        for mask_val in mask_values:
            # Extract only masked pixels.
            pixels = util.get_pixels(
                ras_arr, mask_arr, mask_val=mask_val).transpose()

            # Create a DataFrame of masked pixels and their FID.
            mask_df = pd.DataFrame(pixels, columns=band_names)
            mask_df['__fid__'] = np.full(
                pixels.shape[0], mask_val, dtype=float)
            mask_dfs.append(mask_df)

        # Concat the mask DataFrames, then join the pixels with the vector
        # attributes using the FID.
        tile_df = pd.concat(mask_dfs)
        tile_dfs.append(tile_df.merge(vec_gdf, how='left', on='__fid__'))

    # Merge all the tiles.
    return pd.concat(tile_dfs)