An in-process SQL OLAP database management system (original) (raw)
Search Shortcut cmd + k | ctrl + k
DuckDB is a fast
|
database system
DuckDB supports a feature-rich SQL dialect complemented with deep integrations into client APIs.
- SQL
- Python
- R
- Java
- Node.js
-- Get the top-3 busiest train stations
SELECT station_name, count(*) AS num_services
FROM train_services
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;
DuckDB at a glance
Simple
DuckDB is easy to install and deploy. It has zero external dependencies and runs in-process in its host application or as a single binary.
Portable
DuckDB runs on Linux, macOS, Windows, and all popular hardware architectures. It has idiomatic client APIs for major programming languages.
Feature-rich
DuckDB offers a rich SQL dialect. It can read and write file formats such as CSV, Parquet, and JSON, to and from the local file system and remote endpoints such as S3 buckets.
Fast
DuckDB runs analytical queries at blazing speed thanks to its columnar engine, which supports parallel execution and can process larger-than-memory workloads.
Extensible
DuckDB is extensible by third-party features such as new data types, functions, file formats and new SQL syntax.
Free
DuckDB and its core extensions are open-source under the permissive MIT License.
Installation
DuckDB is seamlessly integrated with major programming languages. It can be installed in less than 20 seconds on most platforms.
- Command line
- Python
- R
- Java
- Node.js
- ODBC
Latest release: DuckDB 1.1.3 | System detected:
install.packages("duckdb")
<dependency>
<groupId>org.duckdb</groupId>
<artifactId>duckdb_jdbc</artifactId>
<version>1.1.3</version>
</dependency>
winget install DuckDB.cli
Aggregation query Join query Read CSV files Read Parquet files from S3 Spatial extension
SQL query Pandas integration Custom UDFs
SQL query dplyr integration
SQL query Appender
SQL query Web service integration
-- Get the top-3 busiest train stations
SELECT station_name, count(*) AS num_services
FROM train_services
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;
-- Load CSV file, auto-detecting column name and types
CREATE TABLE stations AS
FROM 's3://duckdb-blobs/stations.csv';
-- Directly query Parquet file in S3
SELECT station_name, count(*) AS num_services
FROM 's3://duckdb-blobs/train_services.parquet'
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 10;
-- Find the top-3 longest domestic train routes
SELECT s1.name_short, s2.name_short, distances.distance
FROM distances
JOIN stations s1 ON distances.station1 = s1.code
JOIN stations s2 ON distances.station2 = s2.code
WHERE s1.country = s2.country
AND s1.code < s2.code
ORDER BY distance DESC
LIMIT 3;
-- What are the top-3 closest Intercity stations
-- using aerial distance?
SELECT
s1.name_long AS station1,
s2.name_long AS station2,
ST_Distance(
ST_Point(s1.geo_lng, s1.geo_lat),
ST_Point(s2.geo_lng, s2.geo_lat)
) * 111_139 AS distance
FROM stations s1, stations s2
WHERE s1.type LIKE '%Intercity%'
AND s2.type LIKE '%Intercity%'
AND s1.id < s2.id
ORDER BY distance ASC
LIMIT 3;
# Get the top-3 busiest train stations
import duckdb
duckdb.sql("""
SELECT station, count(*) AS num_services
FROM train_services
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;
""")
# Reading and writing Pandas dataframes
import pandas as pd
import duckdb
df_in = pd.DataFrame({
'station': ['Delft', 'Delft', 'Gouda', 'Gouda'],
'day': ['Mon', 'Tue', 'Mon', 'Tue'],
'num_services' : [22, 20, 27, 25]})
# Run query on a dataframe and return a dataframe
df_out = duckdb.sql("""
SELECT station, sum(num_services)
FROM df_in
GROUP BY station
""").to_df()
# Create custom user-defined function
import duckdb
def add_plus_one(x):
return x + 1
con = duckdb.connect()
con.create_function('add_plus_one', add_plus_one,
['BIGINT'], 'BIGINT', type='native')
con.sql("""
SELECT sum(add_plus_one(i)) FROM range(10) tbl(i);
""")
# Find the largest sepals & petals in the Iris data set
library(duckdb)
con <- dbConnect(duckdb())
duckdb_register(con, "iris", iris)
query <- r'(
SELECT count(*) AS num_observations,
max("Sepal.Width") AS max_width,
max("Petal.Length") AS max_petal_length
FROM iris
WHERE "Sepal.Length" > 5
GROUP BY ALL
)'
dbGetQuery(con, query)
# Integration with dplyr
# Find the largest sepals and petals in the Iris data set
library("duckdb")
library("dplyr")
con <- dbConnect(duckdb())
duckdb_register(con, "iris", iris)
tbl(con, "iris") |>
filter(Sepal.Length > 5) |>
group_by(Species) |>
summarize(num_observations = count(),
max_width = max(Sepal.Width),
max_petal_length = max(Petal.Length),
na.rm = TRUE) |>
collect()
// Get a list of train stations by traffic
Connection conn =
DriverManager.getConnection("jdbc:duckdb:");
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery(
"SELECT station_name, count(*) AS num_services\n" +
"FROM train_services\n" +
"GROUP BY ALL\n" +
"ORDER BY num_services DESC;");
while (rs.next()) {
System.out.println(rs.getString(1));
System.out.println(rs.getInt(2));
}
// Using the appender for bulk inserts
DuckDBConnection conn = (DuckDBConnection) DriverManager.getConnection("jdbc:duckdb:");
Statement stmt = conn.createStatement();
stmt.execute("CREATE TABLE person (first_name VARCHAR, last_name VARCHAR, age INT)");
try (var appender = conn.createAppender(
DuckDBConnection.DEFAULT_SCHEMA, "tbl"
)) {
appender.beginRow();
appender.append("John");
appender.append("Smith");
appender.append(42);
appender.endRow();
}
// Get the top-3 busiest train stations in May
const duckdb = require('duckdb');
const db = new duckdb.Database(':memory:');
db.all(
`SELECT station_name, count(*) AS num_services
FROM 'http://blobs.duckdb.org/train_services.parquet'
WHERE monthname(date) = 'May'
GROUP BY ALL
ORDER BY num_services DESC
LIMIT 3;`,
(err, res) => {
if (err) {
console.log("Error", err);
} else {
console.table(res);
}
}
);
// Web Service Integration:
// create endpoint to generate numbers
const express = require("express");
const duckdb = require("duckdb");
const app = express();
const db = new duckdb.Database(":memory:");
app.get("/getnumbers", (req, res) => {
db.all("SELECT random() AS num FROM range(10)", (a, b) => {
if (a) {
console.warn(a);
res.end("Error " + a);
}
res.end(JSON.stringify(b));
});
});
app.listen(8082, () => console.log("Go to: http://localhost:8082/getnumbers"));