Database Setup
Setting up and configuring databases for GoPie
GoPie uses multiple database systems for different purposes. This guide covers the setup and configuration of each database component.
Database Architecture
Overview
GoPie uses three main database systems:
- PostgreSQL - Metadata storage (projects, datasets, users)
- DuckDB - Analytical queries and OLAP workloads
- Qdrant - Vector storage for semantic search
PostgreSQL Setup
Installation
Docker
# docker-compose.yml
services:
postgres:
image: postgres:16-alpine
environment:
POSTGRES_USER: gopie
POSTGRES_PASSWORD: gopie
POSTGRES_DB: gopie
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U gopie"]
interval: 10s
timeout: 5s
retries: 5Local Installation
# macOS
brew install postgresql@16
brew services start postgresql@16
# Ubuntu/Debian
sudo apt update
sudo apt install postgresql-16 postgresql-contrib
sudo systemctl start postgresql
# Create database and user
sudo -u postgres psql <<EOF
CREATE USER gopie WITH PASSWORD 'gopie';
CREATE DATABASE gopie OWNER gopie;
GRANT ALL PRIVILEGES ON DATABASE gopie TO gopie;
EOFSchema Management
GoPie uses Goose for database migrations:
# Run migrations
cd server
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" up
# Check migration status
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" status
# Create new migration
goose -dir infrastructure/postgres/migrations create add_new_feature sqlDatabase Schema
-- Core tables
CREATE TABLE projects (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(255) NOT NULL,
description TEXT,
created_by UUID NOT NULL,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE datasets (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
description TEXT,
file_path VARCHAR(500),
file_size BIGINT,
row_count INTEGER,
column_count INTEGER,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE dataset_columns (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
dataset_id UUID NOT NULL REFERENCES datasets(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
data_type VARCHAR(50),
description TEXT,
is_nullable BOOLEAN DEFAULT true,
is_unique BOOLEAN DEFAULT false,
sample_values JSONB,
statistics JSONB
);
CREATE TABLE chats (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
user_id UUID NOT NULL,
title VARCHAR(255),
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE messages (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
chat_id UUID NOT NULL REFERENCES chats(id) ON DELETE CASCADE,
role VARCHAR(50) NOT NULL,
content TEXT NOT NULL,
metadata JSONB,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
-- Indexes for performance
CREATE INDEX idx_datasets_project_id ON datasets(project_id);
CREATE INDEX idx_dataset_columns_dataset_id ON dataset_columns(dataset_id);
CREATE INDEX idx_chats_project_id ON chats(project_id);
CREATE INDEX idx_chats_user_id ON chats(user_id);
CREATE INDEX idx_messages_chat_id ON messages(chat_id);
CREATE INDEX idx_messages_created_at ON messages(created_at);Connection Pool Configuration
// server/infrastructure/postgres/connection.go
type Config struct {
DSN string
MaxConnections int // Default: 100
MaxIdleConnections int // Default: 10
ConnectionMaxLifetime time.Duration // Default: 1 hour
ConnectionMaxIdleTime time.Duration // Default: 10 minutes
}
// Recommended settings by environment
// Development
MaxConnections: 10
MaxIdleConnections: 5
// Production
MaxConnections: 100
MaxIdleConnections: 25Performance Tuning
-- postgresql.conf optimizations
-- Memory settings
shared_buffers = 256MB # 25% of system memory
effective_cache_size = 1GB # 50-75% of system memory
work_mem = 4MB # Per operation memory
maintenance_work_mem = 64MB # For VACUUM, indexes
-- Connection settings
max_connections = 200 # Adjust based on load
-- Write performance
wal_buffers = 16MB
checkpoint_segments = 32
checkpoint_completion_target = 0.9
-- Query planning
random_page_cost = 1.1 # For SSD storage
effective_io_concurrency = 200 # For SSD storageDuckDB Setup
Installation and Configuration
// server/infrastructure/duckdb/config.go
type DuckDBConfig struct {
Path string // File path or ":memory:"
ReadOnly bool
MaxMemory string // e.g., "4GB"
Threads int // Number of threads
TempDirectory string
}
// Initialize DuckDB
db, err := sql.Open("duckdb", "?access_mode=read_write&threads=4&memory_limit=4GB")DuckDB Extensions
-- Install required extensions
INSTALL 'parquet';
INSTALL 'json';
INSTALL 'httpfs';
INSTALL 'postgres';
-- Load extensions
LOAD 'parquet';
LOAD 'json';
LOAD 'httpfs';
LOAD 'postgres';Data Loading Strategies
-- Load from CSV
CREATE TABLE sales AS
SELECT * FROM read_csv_auto('s3://bucket/sales.csv', header=true);
-- Load from Parquet
CREATE TABLE events AS
SELECT * FROM read_parquet('s3://bucket/events/*.parquet');
-- Load from PostgreSQL
ATTACH 'dbname=gopie user=gopie host=localhost' AS pg (TYPE postgres);
CREATE TABLE users AS SELECT * FROM pg.users;
-- Create view for S3 data
CREATE VIEW s3_data AS
SELECT * FROM read_parquet('s3://bucket/data/*.parquet');Performance Optimization
-- Create indexes
CREATE INDEX idx_date ON sales(date);
CREATE INDEX idx_customer ON sales(customer_id);
-- Analyze tables for statistics
ANALYZE sales;
-- Optimize memory usage
SET memory_limit = '4GB';
SET threads = 4;
SET preserve_insertion_order = false;
-- Enable parallel execution
SET enable_parallel = true;
SET parallel_threshold = 1000;Qdrant Setup
Docker Installation
# docker-compose.yml
services:
qdrant:
image: qdrant/qdrant:latest
ports:
- "6333:6333"
- "6334:6334"
volumes:
- qdrant_data:/qdrant/storage
environment:
QDRANT__LOG_LEVEL: INFO
QDRANT__SERVICE__HTTP_PORT: 6333
QDRANT__SERVICE__GRPC_PORT: 6334Collection Configuration
# chat-server/app/services/qdrant_service.py
from qdrant_client import QdrantClient
from qdrant_client.models import (
Distance, VectorParams, PointStruct,
CreateCollection, OptimizersConfig,
HnswConfig
)
# Initialize client
client = QdrantClient(url="http://localhost:6333")
# Create collection for schema embeddings
client.recreate_collection(
collection_name="gopie_schemas",
vectors_config=VectorParams(
size=1536, # OpenAI embedding dimension
distance=Distance.COSINE
),
optimizers_config=OptimizersConfig(
default_segment_number=4,
max_segment_size=100000,
memmap_threshold=50000,
indexing_threshold=20000,
flush_interval_sec=10
),
hnsw_config=HnswConfig(
m=16,
ef_construct=100,
full_scan_threshold=10000
)
)
# Create indexes for metadata filtering
client.create_payload_index(
collection_name="gopie_schemas",
field_name="dataset_id",
field_schema="keyword"
)
client.create_payload_index(
collection_name="gopie_schemas",
field_name="column_name",
field_schema="keyword"
)Data Ingestion
# Prepare and index schema data
def index_dataset_schema(dataset_id: str, columns: List[Column]):
points = []
for column in columns:
# Create embedding for column
text = f"{column.name} {column.description} {column.data_type}"
embedding = get_embedding(text)
# Create point with metadata
point = PointStruct(
id=str(uuid.uuid4()),
vector=embedding,
payload={
"dataset_id": dataset_id,
"column_name": column.name,
"data_type": column.data_type,
"description": column.description,
"sample_values": column.sample_values
}
)
points.append(point)
# Batch upsert
client.upsert(
collection_name="gopie_schemas",
points=points
)Database Connections
Connection Strings
# PostgreSQL
DATABASE_URL=postgres://user:password@host:port/database?sslmode=require
# PostgreSQL with connection pool
DATABASE_URL=postgres://user:password@host:port/database?pool_max_conns=10
# DuckDB file-based
DUCKDB_PATH=/var/lib/gopie/analytics.duckdb
# DuckDB in-memory
DUCKDB_PATH=:memory:
# Qdrant
QDRANT_URL=http://localhost:6333
QDRANT_GRPC_URL=localhost:6334Health Checks
// PostgreSQL health check
func (db *PostgresDB) HealthCheck(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 1*time.Second)
defer cancel()
return db.PingContext(ctx)
}
// DuckDB health check
func (db *DuckDB) HealthCheck(ctx context.Context) error {
_, err := db.QueryContext(ctx, "SELECT 1")
return err
}# Qdrant health check
async def check_qdrant_health():
try:
info = await client.get_collections()
return {"status": "healthy", "collections": len(info.collections)}
except Exception as e:
return {"status": "unhealthy", "error": str(e)}Backup and Recovery
PostgreSQL Backup
# Full backup
pg_dump -h localhost -U gopie -d gopie -f gopie_backup.sql
# Compressed backup
pg_dump -h localhost -U gopie -d gopie | gzip > gopie_backup.sql.gz
# Backup specific tables
pg_dump -h localhost -U gopie -d gopie -t projects -t datasets > schema_backup.sql
# Restore from backup
psql -h localhost -U gopie -d gopie < gopie_backup.sql
# Automated backup script
#!/bin/bash
BACKUP_DIR="/backups/postgres"
DATE=$(date +%Y%m%d_%H%M%S)
pg_dump -h localhost -U gopie -d gopie | gzip > "$BACKUP_DIR/gopie_$DATE.sql.gz"
# Keep only last 7 days
find $BACKUP_DIR -name "gopie_*.sql.gz" -mtime +7 -deleteDuckDB Backup
-- Export to Parquet
COPY (SELECT * FROM analytics_table)
TO 's3://backup-bucket/analytics_backup.parquet'
(FORMAT PARQUET, COMPRESSION ZSTD);
-- Backup entire database
EXPORT DATABASE '/backup/duckdb_backup' (FORMAT PARQUET);Qdrant Backup
# Create snapshot
snapshot = client.create_snapshot(collection_name="gopie_schemas")
# Download snapshot
import requests
response = requests.get(f"http://localhost:6333/collections/gopie_schemas/snapshots/{snapshot.name}")
with open(f"backup_{snapshot.name}", "wb") as f:
f.write(response.content)
# Restore from snapshot
client.restore_snapshot(
collection_name="gopie_schemas",
location="backup_snapshot_name"
)Monitoring
PostgreSQL Monitoring
-- Active connections
SELECT count(*) FROM pg_stat_activity;
-- Long running queries
SELECT pid, now() - pg_stat_activity.query_start AS duration, query
FROM pg_stat_activity
WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes';
-- Table sizes
SELECT schemaname, tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;
-- Index usage
SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;Prometheus Metrics
# prometheus.yml
scrape_configs:
- job_name: 'postgres'
static_configs:
- targets: ['postgres-exporter:9187']
- job_name: 'qdrant'
static_configs:
- targets: ['localhost:6333']
metrics_path: '/metrics'Troubleshooting
Common PostgreSQL Issues
-
Connection refused:
# Check if PostgreSQL is running sudo systemctl status postgresql # Check listening ports sudo netstat -plnt | grep 5432 # Check pg_hba.conf for authentication -
Too many connections:
-- Check current connections SELECT count(*) FROM pg_stat_activity; -- Terminate idle connections SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND state_change < now() - interval '10 minutes'; -
Slow queries:
-- Enable query logging ALTER SYSTEM SET log_min_duration_statement = '1000'; -- Log queries > 1 second SELECT pg_reload_conf();
DuckDB Performance Issues
-- Check memory usage
SELECT * FROM duckdb_memory();
-- Profile query
EXPLAIN ANALYZE SELECT * FROM large_table WHERE condition;
-- Force garbage collection
CALL duckdb_gc();Qdrant Issues
# Check collection info
info = client.get_collection("gopie_schemas")
print(f"Points: {info.points_count}")
print(f"Indexed: {info.indexed_vectors_count}")
print(f"Status: {info.status}")
# Optimize collection
client.update_collection(
collection_name="gopie_schemas",
optimizer_config=OptimizersConfig(
indexing_threshold=10000
)
)Next Steps
- Configure Storage
- Set up AI Providers
- Deploy with Docker
- Implement Monitoring