// Example: reading through a custom data source.
// The format name selects the custom source implementation; options are
// passed through to it verbatim. Selecting a column subset and filtering
// give the source the chance to apply column pruning and filter pushdown.
spark.read
  .format("XXX")           // <-- your custom data source / format
  .option("header", true)  // <-- the data source supports options
  .load("data.xxx")
  .select("id", "name")    // column pruning
  .where($"id" > 5)        // filter pushdown
  .show
// Example: writing through a custom data source.
// As on the read path, the format name selects the implementation and
// options are forwarded to it; the save mode controls behavior when the
// target already exists.
dataframe.write
  .format("XXX")           // <-- your custom data source / format
  .option("header", true)  // <-- the data source supports options
  .mode("overwrite")       // modes
  .save("data.xxx")
/**
 * Base contract every custom relation must implement: where it lives
 * (`sqlContext`), what it produces (`schema`), plus overridable hints
 * used by the planner.
 */
abstract class BaseRelation {

  /** The SQLContext this relation is bound to. */
  def sqlContext: SQLContext

  /** Schema of the rows this relation produces. */
  def schema: StructType

  /**
   * Estimated size of the data, in bytes. Falls back to the
   * context-wide default estimate when not overridden.
   */
  def sizeInBytes: Long = sqlContext.conf.defaultSizeInBytes

  /**
   * Whether produced rows require conversion before use (defaults to true).
   * NOTE(review): presumably Row -> InternalRow conversion — confirm
   * against the Spark sources API docs.
   */
  def needConversion: Boolean = true

  /**
   * Returns the filters this relation cannot handle itself. The default
   * returns every filter unchanged, i.e. claims to handle none of them.
   */
  def unhandledFilters(filters: Array[Filter]): Array[Filter] = filters
}
/**
 * Scan supporting column pruning: the implementation should produce rows
 * containing only the columns named in `requiredColumns`.
 */
trait PrunedScan {

  /** Builds an RDD of rows restricted to the requested columns. */
  def buildScan(requiredColumns: Array[String]): RDD[Row]
}
/**
 * Scan supporting both column pruning and filter pushdown: the
 * implementation receives the required columns plus the filters Spark
 * would like evaluated at the source. Filters the source cannot handle
 * are reported via `BaseRelation.unhandledFilters`.
 */
trait PrunedFilteredScan {

  /** Builds an RDD of rows for the requested columns, applying pushed-down filters where possible. */
  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]
}
/**
 * Simplest scan contract: produce every row with every column — no
 * pruning, no pushdown.
 */
trait TableScan {

  /** Builds an RDD containing the full contents of the relation. */
  def buildScan(): RDD[Row]
}