Once you have your table path, reading shared data is simple in any language.
Python
Scala
Java
R
SQL
# Define the table path: <profile-file>#<share>.<schema>.<table>
table_path = "<profile-file-path>#<share-name>.<schema-name>.<table-name>"

# Load the shared table as a DataFrame
df = spark.read.format("deltaSharing").load(table_path)

# Display the data
df.show()

# Perform operations
df.filter(df.age > 21).select("name", "age").show()
# Using the example Delta Sharing server
table_path = "/tmp/open-datasets.share#delta_sharing.default.boston-housing"

# Load the shared table and inspect its size and schema
df = spark.read.format("deltaSharing").load(table_path)
print(f"Rows: {df.count()}, Columns: {len(df.columns)}")
df.printSchema()
// Define the table path: <profile-file>#<share>.<schema>.<table>
val tablePath = "<profile-file-path>#<share-name>.<schema-name>.<table-name>"

// Load the shared table as a DataFrame
val df = spark.read.format("deltaSharing").load(tablePath)

// Display the data
df.show()

// Perform operations
df.filter($"age" > 21).select("name", "age").show()
# Using the example Delta Sharing server
table_path <- "/tmp/open-datasets.share#delta_sharing.default.boston-housing"

# Load the shared table as a SparkR DataFrame and inspect it
df <- read.df(table_path, "deltaSharing")
cat("Rows:", nrow(df), "Columns:", ncol(df), "\n")
printSchema(df)
-- First, create a table reference
CREATE TABLE shared_table
USING deltaSharing LOCATION '<profile-file-path>#<share-name>.<schema-name>.<table-name>';

-- Now you can query it like any other table
SELECT * FROM shared_table;

-- Perform operations
SELECT name, age FROM shared_table WHERE age > 21;
-- Using the example Delta Sharing server
CREATE TABLE boston_housing
USING deltaSharing LOCATION '/tmp/open-datasets.share#delta_sharing.default.boston-housing';

-- Count the rows and inspect the schema
SELECT COUNT(*) as row_count FROM boston_housing;
DESCRIBE boston_housing;
Once loaded, shared tables behave like regular Spark DataFrames:
# Standard DataFrame operations work seamlessly
df = spark.read.format("deltaSharing").load(table_path)

# Aggregations
df.groupBy("category").count().show()

# Joins with other DataFrames
local_df = spark.read.parquet("/path/to/local/data")
joined = df.join(local_df, "id")

# Write results to storage
df.filter(df.status == "active").write.parquet("/output/path")
# Select only needed columns for better performance
df = spark.read.format("deltaSharing").load(table_path)
result = df.select("id", "name", "timestamp")
Filtering data
# Apply filters to reduce data transfer
df = spark.read.format("deltaSharing").load(table_path)
filtered = df.filter((df.date >= "2024-01-01") & (df.status == "active"))
Caching for multiple queries
# Cache the DataFrame if you'll use it multiple times
df = spark.read.format("deltaSharing").load(table_path)
df.cache()

# Now multiple operations won't re-fetch the data
df.count()
df.groupBy("category").count().show()
Creating temporary views
# Create a temporary view for SQL access
df = spark.read.format("deltaSharing").load(table_path)
df.createOrReplaceTempView("shared_data")

# Query with SQL
spark.sql("SELECT * FROM shared_data WHERE age > 21").show()