What is DuckDB?
DuckDB is an in-process SQL OLAP database designed for fast analytical query processing. Unlike traditional database servers, DuckDB runs embedded within applications, making it ideal for data analysis, ETL pipelines, and analytical workloads.
- Embedded: No separate server process required
- Columnar storage: Optimized for analytical queries
- Zero dependencies: Single binary, no external requirements
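Because DuckDB is embedded, there is no daemon to install or connect to; a query is an ordinary library call inside your own program. A minimal sketch using the Python package (installed in Method 2 below):
import duckdb
# No server process: the engine runs inside this Python process
con = duckdb.connect()  # in-memory database; pass a filename to persist data
print(con.execute("SELECT 42 AS answer").fetchone())  # (42,)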
Prerequisites & VPS Selection
Required Software: wget, curl, unzip; build-essential (only needed for building from source, Method 3); Python 3 with pip (optional, for the Python package in Method 2)
sudo apt update
sudo apt install -y wget curl unzip build-essential python3 python3-pip
Install DuckDB
Method 1: Binary Installation (Recommended)
# Download the latest DuckDB CLI binary
wget https://github.com/duckdb/duckdb/releases/latest/download/duckdb_cli-linux-amd64.zip
# Unzip the binary
unzip duckdb_cli-linux-amd64.zip
# Move to system binary directory
sudo mv duckdb /usr/local/bin/
# Make executable
sudo chmod +x /usr/local/bin/duckdb
# Verify installation
duckdb --version
Method 2: Python Installation
# Install DuckDB Python package
pip3 install duckdb
# Verify installation
python3 -c "import duckdb; print(duckdb.__version__)"Method 3: Build from Source
# Install build dependencies
sudo apt install -y git cmake ninja-build
# Clone the repository
git clone https://github.com/duckdb/duckdb.git
cd duckdb
# Build DuckDB
make
# Install the binary
sudo cp build/release/duckdb /usr/local/bin/
# Verify installation
duckdb --version
Basic Database Usage
# Create persistent database
duckdb mydata.db
# Or use in-memory database (data lost on exit)
duckdb :memory:
-- Create a sample table
CREATE TABLE users (
id INTEGER PRIMARY KEY,
name VARCHAR,
email VARCHAR,
created_at TIMESTAMP
);
-- Insert sample data
INSERT INTO users VALUES
(1, 'Alice Johnson', 'alice@example.com', '2024-01-15 10:30:00'),
(2, 'Bob Smith', 'bob@example.com', '2024-01-16 14:20:00'),
(3, 'Carol White', 'carol@example.com', '2024-01-17 09:45:00');
-- Query the data
SELECT * FROM users WHERE created_at >= '2024-01-16';
Working with File Formats
CSV Files
-- Read CSV directly without importing
SELECT * FROM read_csv_auto('data.csv');
-- Create table from CSV
CREATE TABLE sales AS
SELECT * FROM read_csv_auto('sales_data.csv');
-- Export query results to CSV
COPY (SELECT * FROM sales WHERE amount > 1000)
TO 'high_value_sales.csv' (HEADER, DELIMITER ',');
Parquet Files
-- Query Parquet files directly
SELECT * FROM read_parquet('data.parquet');
-- Create table from multiple Parquet files
CREATE TABLE analytics AS
SELECT * FROM read_parquet('analytics_*.parquet');
-- Export to Parquet (compressed)
COPY users TO 'users.parquet' (FORMAT PARQUET, COMPRESSION ZSTD);
JSON Files
-- Read JSON files
SELECT * FROM read_json_auto('data.json');
-- Read JSONL (newline-delimited JSON)
SELECT * FROM read_json_auto('logs.jsonl', format='newline_delimited');
-- Extract JSON fields
SELECT
data->>'$.name' as name,
data->>'$.age' as age
FROM read_json_auto('users.json');
Python Integration
Basic Python Usage
import duckdb
# Create connection
con = duckdb.connect('analytics.db')
# Execute query
result = con.execute("""
SELECT
date_trunc('day', timestamp) as day,
COUNT(*) as events,
AVG(value) as avg_value
FROM events
WHERE timestamp >= '2024-01-01'
GROUP BY day
ORDER BY day
""").fetchall()
# Print results
for row in result:
print(f"Day: {row[0]}, Events: {row[1]}, Avg Value: {row[2]:.2f}")
# Close connection
con.close()Pandas Integration
import duckdb
import pandas as pd
# Create connection
con = duckdb.connect('mydata.db')
# Query to DataFrame
df = con.execute("SELECT * FROM users").df()
# Write DataFrame to DuckDB
con.execute("CREATE TABLE new_users AS SELECT * FROM df")
# Register DataFrame as virtual table
con.register('pandas_users', df)
result = con.execute("SELECT * FROM pandas_users WHERE name LIKE 'A%'").df()ETL Pipeline Example
import duckdb
import requests
from datetime import datetime
def etl_pipeline():
    con = duckdb.connect('etl_data.db')
    # Extract: Fetch data from API
    response = requests.get('https://api.example.com/data')
    data = response.json()
    # Transform: Process with DuckDB
    con.execute("CREATE TABLE IF NOT EXISTS raw_data (id INT, value VARCHAR, timestamp TIMESTAMP)")
    # Load: Insert data
    for record in data:
        con.execute("""
            INSERT INTO raw_data VALUES (?, ?, ?)
        """, [record['id'], record['value'], datetime.now()])
    # Aggregate
    result = con.execute("""
        SELECT
            DATE_TRUNC('hour', timestamp) as hour,
            COUNT(*) as record_count,
            COUNT(DISTINCT id) as unique_ids
        FROM raw_data
        GROUP BY hour
        ORDER BY hour DESC
        LIMIT 24
    """).fetchdf()
    return result
if __name__ == "__main__":
    print(etl_pipeline())
Web API Integration
Flask API Example
from flask import Flask, jsonify, request
import duckdb
app = Flask(__name__)
db_connection = duckdb.connect('api_data.db', read_only=False)
@app.route('/api/query', methods=['POST'])
def execute_query():
    try:
        query = request.json.get('query')
        result = db_connection.execute(query).fetchall()
        columns = [desc[0] for desc in db_connection.description]
        return jsonify({
            'success': True,
            'columns': columns,
            'data': result
        })
    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 400
@app.route('/api/analytics/summary', methods=['GET'])
def get_summary():
    result = db_connection.execute("""
        SELECT
            COUNT(*) as total_records,
            COUNT(DISTINCT user_id) as unique_users,
            MAX(timestamp) as last_updated
        FROM events
    """).fetchone()
    return jsonify({
        'total_records': result[0],
        'unique_users': result[1],
        'last_updated': str(result[2])
    })
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
Install Flask and start the API:
pip3 install flask
python3 app.py
FastAPI Example
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import duckdb
app = FastAPI()
db = duckdb.connect('analytics.db')
class QueryRequest(BaseModel):
    sql: str
    limit: int = 100
@app.get("/")
def read_root():
    return {"message": "DuckDB API Server"}
@app.post("/query")
def execute_query(request: QueryRequest):
    try:
        result = db.execute(f"{request.sql} LIMIT {request.limit}").fetchall()
        columns = [desc[0] for desc in db.description]
        return {
            "columns": columns,
            "rows": result,
            "row_count": len(result)
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
@app.get("/tables")
def list_tables():
    result = db.execute("SHOW TABLES").fetchall()
    return {"tables": [r[0] for r in result]}
Install FastAPI and Uvicorn, then start the server:
pip3 install fastapi uvicorn
uvicorn fastapi_app:app --host 0.0.0.0 --port 8000
Performance Optimization
Query Optimization Tips
- 1. Use appropriate data types: Smaller types reduce memory usage
- 2. Create indexes: CREATE INDEX idx_user_email ON users(email)
- 3. Partition large tables: Split data by date or category
- 4. Use columnar storage: Parquet files for large datasets
- 5. Batch inserts: Insert multiple rows in a single transaction (see the Python sketch after this list)
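For tip 5, a minimal Python sketch of batching rows into one transaction; the events_batch table and the generated rows are made up for illustration:
import duckdb
con = duckdb.connect('mydata.db')
con.execute("CREATE TABLE IF NOT EXISTS events_batch (id INTEGER, value DOUBLE)")
rows = [(i, i * 0.5) for i in range(10_000)]  # example data
# One transaction for the whole batch instead of one implicit commit per row
con.execute("BEGIN TRANSACTION")
con.executemany("INSERT INTO events_batch VALUES (?, ?)", rows)
con.execute("COMMIT")
For very large loads, writing the data to CSV or Parquet and loading it with COPY or read_parquet is usually faster still.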
-- Enable profiling
PRAGMA enable_profiling;
-- Run query
SELECT * FROM large_table WHERE date > '2024-01-01';
-- View execution plan
EXPLAIN SELECT * FROM large_table WHERE date > '2024-01-01';
-- View detailed query graph
EXPLAIN ANALYZE SELECT * FROM large_table WHERE date > '2024-01-01';
Security Configuration
File Permissions
# Set appropriate permissions for database files
chmod 640 mydata.db
chown www-data:www-data mydata.db
# Restrict directory access
chmod 750 /path/to/duckdb/data
Read-Only Connections
# Open database in read-only mode
con = duckdb.connect('mydata.db', read_only=True)
Input Validation
# Bad - vulnerable to SQL injection
user_input = request.args.get('email')
con.execute(f"SELECT * FROM users WHERE email = '{user_input}'")
# Good - parameterized query
user_input = request.args.get('email')
con.execute("SELECT * FROM users WHERE email = ?", [user_input])# Use firewall to restrict access
sudo ufw allow from 10.0.0.0/8 to any port 5000
# Use nginx reverse proxy with SSL
sudo apt install nginx certbot python3-certbot-nginx
Backup and Maintenance
Backup Strategies
# Simple file copy (database must be closed)
cp mydata.db mydata.db.backup
# Export to SQL dump
duckdb mydata.db -c ".dump" > backup.sql
# Export to Parquet for archival
duckdb mydata.db -c "COPY (SELECT * FROM important_table) TO 'backup.parquet'"Automated Backup Script
#!/bin/bash
# backup_duckdb.sh
DB_PATH="/var/duckdb/data/mydata.db"
BACKUP_DIR="/var/backups/duckdb"
DATE=$(date +%Y%m%d_%H%M%S)
# Create backup directory
mkdir -p "$BACKUP_DIR"
# Copy database file
cp "$DB_PATH" "$BACKUP_DIR/mydata_$DATE.db"
# Compress old backups
find "$BACKUP_DIR" -name "*.db" -mtime +1 -exec gzip {} \;
# Remove backups older than 30 days
find "$BACKUP_DIR" -name "*.db.gz" -mtime +30 -delete
echo "Backup completed: $DATE"# Add to crontab
crontab -e
# Add daily backup at 2 AM
0 2 * * * /path/to/backup_duckdb.sh
Database Maintenance
-- Reclaim space from deleted rows
VACUUM;
-- Analyze tables for query optimization
ANALYZE;
-- Check database integrity
PRAGMA integrity_check;
Common Use Cases
Troubleshooting
Best Practices
- 1. Use appropriate storage format: Parquet for large datasets, CSV for small ones
- 2. Implement connection pooling: For multi-threaded applications
- 3. Regular maintenance: Run VACUUM and ANALYZE periodically
- 4. Monitor resource usage: Set memory limits appropriate for your VPS (see the Python sketch after this list)
- 5. Version control: Track schema changes with migration scripts
- 6. Test queries: Use EXPLAIN ANALYZE before production deployment
- 7. Backup regularly: Automate backups with cron jobs
- 8. Security first: Use read-only connections when possible
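For practices 2 and 4, a minimal sketch of capping resource usage and sharing one database across threads in Python; the 1 GB and 2-thread values are only examples to tune for your VPS:
import duckdb
# Open the database with explicit resource caps (example values)
con = duckdb.connect('mydata.db', config={'memory_limit': '1GB', 'threads': 2})
# Settings can also be adjusted at runtime
con.execute("SET memory_limit = '512MB'")
print(con.execute("SELECT current_setting('memory_limit')").fetchone())
# For multi-threaded applications, give each worker its own cursor on the shared connection
cur = con.cursor()
print(cur.execute("SELECT 1").fetchone())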
Deployment Complete!
DuckDB is ready for data analysis, ETL pipelines, and analytical applications on your RamNode VPS. Its zero-dependency architecture and excellent performance make it ideal for embedded analytics.
Ready to Deploy DuckDB?
Get started with a RamNode VPS optimized for analytical workloads.
View VPS Plans