Jacek is best known for his GitBooks:
scala> spark.catalog.listTables.show
+-----+--------+-----------+---------+-----------+
| name|database|description|tableType|isTemporary|
+-----+--------+-----------+---------+-----------+
| huge| default| null| MANAGED| false|
|large| default| null| MANAGED| false|
+-----+--------+-----------+---------+-----------+
// Disable broadcast joins so the join below falls back to a shuffle-based SortMergeJoin
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
val huge = spark.table("huge")
val large = spark.table("large")
// foreach forces execution of the join without collecting the rows to the driver
huge.join(large, "id").foreach(_ => ())
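To see the shuffle that bucketing is meant to avoid, you can inspect the physical plan of the join (a quick check using the same huge and large tables as above):

// Both sides should show Exchange hashpartitioning operators before the
// SortMergeJoin, i.e. a full shuffle of both tables.
huge.join(large, "id").explain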
scala> println(spark.version)
2.3.2
scala> println(spark.sessionState.conf.bucketingEnabled)
true
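bucketingEnabled reflects the spark.sql.sources.bucketing.enabled configuration property (on by default), so you can toggle it explicitly if you ever need to, e.g.:

// Make the default explicit; set it to false to have Spark ignore bucket metadata
spark.conf.set("spark.sql.sources.bucketing.enabled", true)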
// Creating bucketed tables
import org.apache.spark.sql.SaveMode
large.write
  .bucketBy(4, "id")        // <-- bucketing
  .sortBy("id")             // <-- optional sorting
  .mode(SaveMode.Overwrite)
  .saveAsTable("bucketed_large")
huge.write
  .bucketBy(4, "id")        // <-- bucketing
  .sortBy("id")             // <-- optional sorting
  .mode(SaveMode.Overwrite)
  .saveAsTable("bucketed_huge")
// Broadcast joins stay disabled so the join strategy is the same as before
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
val bucketed_large = spark.table("bucketed_large")
val bucketed_huge = spark.table("bucketed_huge")
// Both sides are bucketed by id into the same number of buckets,
// so the sort-merge join can reuse the buckets and skip the shuffle
bucketed_large.join(bucketed_huge, "id").foreach(_ => ())
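Again, the physical plan makes the difference visible (same tables as above):

// This time there should be no Exchange operators, since the bucketed tables
// already provide the hash partitioning on id that the join requires.
bucketed_large.join(bucketed_huge, "id").explain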
// DESCRIBE EXTENDED reports the bucketing metadata (Num Buckets, Bucket Columns, Sort Columns)
val describeSQL = sql("DESCRIBE EXTENDED bucketed_large")
describeSQL.show(numRows = 50, truncate = false)
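If you only care about the bucketing rows, you can filter the DESCRIBE output (the col_name labels below are the ones I'd expect from Spark 2.3, shown here as an illustration):

describeSQL
  .filter("col_name in ('Num Buckets', 'Bucket Columns', 'Sort Columns')")
  .show(truncate = false)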