Database Setup

Setting up and configuring databases for GoPie

GoPie uses multiple database systems for different purposes. This guide covers the setup and configuration of each database component.

Database Architecture

Overview

GoPie uses three main database systems:

  1. PostgreSQL - Metadata storage (projects, datasets, users)
  2. DuckDB - Analytical queries and OLAP workloads
  3. Qdrant - Vector storage for semantic search

PostgreSQL Setup

Installation

Docker

# docker-compose.yml
services:
  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: gopie
      POSTGRES_PASSWORD: gopie
      POSTGRES_DB: gopie
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U gopie"]
      interval: 10s
      timeout: 5s
      retries: 5

Local Installation

# macOS
brew install postgresql@16
brew services start postgresql@16

# Ubuntu/Debian
sudo apt update
sudo apt install postgresql-16 postgresql-contrib
sudo systemctl start postgresql

# Create database and user
sudo -u postgres psql <<EOF
CREATE USER gopie WITH PASSWORD 'gopie';
CREATE DATABASE gopie OWNER gopie;
GRANT ALL PRIVILEGES ON DATABASE gopie TO gopie;
EOF

Schema Management

GoPie uses Goose for database migrations:

# Run migrations
cd server
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" up

# Check migration status
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" status

# Create new migration
goose -dir infrastructure/postgres/migrations create add_new_feature sql

Database Schema

-- Core tables
CREATE TABLE projects (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_by UUID NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE datasets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    file_path VARCHAR(500),
    file_size BIGINT,
    row_count INTEGER,
    column_count INTEGER,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE dataset_columns (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    dataset_id UUID NOT NULL REFERENCES datasets(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    data_type VARCHAR(50),
    description TEXT,
    is_nullable BOOLEAN DEFAULT true,
    is_unique BOOLEAN DEFAULT false,
    sample_values JSONB,
    statistics JSONB
);

CREATE TABLE chats (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    user_id UUID NOT NULL,
    title VARCHAR(255),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE messages (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    chat_id UUID NOT NULL REFERENCES chats(id) ON DELETE CASCADE,
    role VARCHAR(50) NOT NULL,
    content TEXT NOT NULL,
    metadata JSONB,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for performance
CREATE INDEX idx_datasets_project_id ON datasets(project_id);
CREATE INDEX idx_dataset_columns_dataset_id ON dataset_columns(dataset_id);
CREATE INDEX idx_chats_project_id ON chats(project_id);
CREATE INDEX idx_chats_user_id ON chats(user_id);
CREATE INDEX idx_messages_chat_id ON messages(chat_id);
CREATE INDEX idx_messages_created_at ON messages(created_at);

Connection Pool Configuration

// server/infrastructure/postgres/connection.go
type Config struct {
    DSN                  string
    MaxConnections       int           // Default: 100
    MaxIdleConnections   int           // Default: 10
    ConnectionMaxLifetime time.Duration // Default: 1 hour
    ConnectionMaxIdleTime time.Duration // Default: 10 minutes
}

// Recommended settings by environment
// Development
MaxConnections: 10
MaxIdleConnections: 5

// Production
MaxConnections: 100
MaxIdleConnections: 25

Performance Tuning

-- postgresql.conf optimizations

-- Memory settings
shared_buffers = 256MB          # 25% of system memory
effective_cache_size = 1GB      # 50-75% of system memory
work_mem = 4MB                  # Per operation memory
maintenance_work_mem = 64MB     # For VACUUM, indexes

-- Connection settings
max_connections = 200           # Adjust based on load

-- Write performance
wal_buffers = 16MB
checkpoint_segments = 32
checkpoint_completion_target = 0.9

-- Query planning
random_page_cost = 1.1          # For SSD storage
effective_io_concurrency = 200  # For SSD storage

DuckDB Setup

Installation and Configuration

// server/infrastructure/duckdb/config.go
type DuckDBConfig struct {
    Path         string // File path or ":memory:"
    ReadOnly     bool
    MaxMemory    string // e.g., "4GB"
    Threads      int    // Number of threads
    TempDirectory string
}

// Initialize DuckDB
db, err := sql.Open("duckdb", "?access_mode=read_write&threads=4&memory_limit=4GB")

DuckDB Extensions

-- Install required extensions
INSTALL 'parquet';
INSTALL 'json';
INSTALL 'httpfs';
INSTALL 'postgres';

-- Load extensions
LOAD 'parquet';
LOAD 'json';
LOAD 'httpfs';
LOAD 'postgres';

Data Loading Strategies

-- Load from CSV
CREATE TABLE sales AS 
SELECT * FROM read_csv_auto('s3://bucket/sales.csv', header=true);

-- Load from Parquet
CREATE TABLE events AS 
SELECT * FROM read_parquet('s3://bucket/events/*.parquet');

-- Load from PostgreSQL
ATTACH 'dbname=gopie user=gopie host=localhost' AS pg (TYPE postgres);
CREATE TABLE users AS SELECT * FROM pg.users;

-- Create view for S3 data
CREATE VIEW s3_data AS 
SELECT * FROM read_parquet('s3://bucket/data/*.parquet');

Performance Optimization

-- Create indexes
CREATE INDEX idx_date ON sales(date);
CREATE INDEX idx_customer ON sales(customer_id);

-- Analyze tables for statistics
ANALYZE sales;

-- Optimize memory usage
SET memory_limit = '4GB';
SET threads = 4;
SET preserve_insertion_order = false;

-- Enable parallel execution
SET enable_parallel = true;
SET parallel_threshold = 1000;

Qdrant Setup

Docker Installation

# docker-compose.yml
services:
  qdrant:
    image: qdrant/qdrant:latest
    ports:
      - "6333:6333"
      - "6334:6334"
    volumes:
      - qdrant_data:/qdrant/storage
    environment:
      QDRANT__LOG_LEVEL: INFO
      QDRANT__SERVICE__HTTP_PORT: 6333
      QDRANT__SERVICE__GRPC_PORT: 6334

Collection Configuration

# chat-server/app/services/qdrant_service.py
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    CreateCollection, OptimizersConfig,
    HnswConfig
)

# Initialize client
client = QdrantClient(url="http://localhost:6333")

# Create collection for schema embeddings
client.recreate_collection(
    collection_name="gopie_schemas",
    vectors_config=VectorParams(
        size=1536,  # OpenAI embedding dimension
        distance=Distance.COSINE
    ),
    optimizers_config=OptimizersConfig(
        default_segment_number=4,
        max_segment_size=100000,
        memmap_threshold=50000,
        indexing_threshold=20000,
        flush_interval_sec=10
    ),
    hnsw_config=HnswConfig(
        m=16,
        ef_construct=100,
        full_scan_threshold=10000
    )
)

# Create indexes for metadata filtering
client.create_payload_index(
    collection_name="gopie_schemas",
    field_name="dataset_id",
    field_schema="keyword"
)

client.create_payload_index(
    collection_name="gopie_schemas",
    field_name="column_name",
    field_schema="keyword"
)

Data Ingestion

# Prepare and index schema data
def index_dataset_schema(dataset_id: str, columns: List[Column]):
    points = []
    
    for column in columns:
        # Create embedding for column
        text = f"{column.name} {column.description} {column.data_type}"
        embedding = get_embedding(text)
        
        # Create point with metadata
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "dataset_id": dataset_id,
                "column_name": column.name,
                "data_type": column.data_type,
                "description": column.description,
                "sample_values": column.sample_values
            }
        )
        points.append(point)
    
    # Batch upsert
    client.upsert(
        collection_name="gopie_schemas",
        points=points
    )

Database Connections

Connection Strings

# PostgreSQL
DATABASE_URL=postgres://user:password@host:port/database?sslmode=require

# PostgreSQL with connection pool
DATABASE_URL=postgres://user:password@host:port/database?pool_max_conns=10

# DuckDB file-based
DUCKDB_PATH=/var/lib/gopie/analytics.duckdb

# DuckDB in-memory
DUCKDB_PATH=:memory:

# Qdrant
QDRANT_URL=http://localhost:6333
QDRANT_GRPC_URL=localhost:6334

Health Checks

// PostgreSQL health check
func (db *PostgresDB) HealthCheck(ctx context.Context) error {
    ctx, cancel := context.WithTimeout(ctx, 1*time.Second)
    defer cancel()
    
    return db.PingContext(ctx)
}

// DuckDB health check
func (db *DuckDB) HealthCheck(ctx context.Context) error {
    _, err := db.QueryContext(ctx, "SELECT 1")
    return err
}
# Qdrant health check
async def check_qdrant_health():
    try:
        info = await client.get_collections()
        return {"status": "healthy", "collections": len(info.collections)}
    except Exception as e:
        return {"status": "unhealthy", "error": str(e)}

Backup and Recovery

PostgreSQL Backup

# Full backup
pg_dump -h localhost -U gopie -d gopie -f gopie_backup.sql

# Compressed backup
pg_dump -h localhost -U gopie -d gopie | gzip > gopie_backup.sql.gz

# Backup specific tables
pg_dump -h localhost -U gopie -d gopie -t projects -t datasets > schema_backup.sql

# Restore from backup
psql -h localhost -U gopie -d gopie < gopie_backup.sql

# Automated backup script
#!/bin/bash
BACKUP_DIR="/backups/postgres"
DATE=$(date +%Y%m%d_%H%M%S)
pg_dump -h localhost -U gopie -d gopie | gzip > "$BACKUP_DIR/gopie_$DATE.sql.gz"

# Keep only last 7 days
find $BACKUP_DIR -name "gopie_*.sql.gz" -mtime +7 -delete

DuckDB Backup

-- Export to Parquet
COPY (SELECT * FROM analytics_table) 
TO 's3://backup-bucket/analytics_backup.parquet' 
(FORMAT PARQUET, COMPRESSION ZSTD);

-- Backup entire database
EXPORT DATABASE '/backup/duckdb_backup' (FORMAT PARQUET);

Qdrant Backup

# Create snapshot
snapshot = client.create_snapshot(collection_name="gopie_schemas")

# Download snapshot
import requests
response = requests.get(f"http://localhost:6333/collections/gopie_schemas/snapshots/{snapshot.name}")
with open(f"backup_{snapshot.name}", "wb") as f:
    f.write(response.content)

# Restore from snapshot
client.restore_snapshot(
    collection_name="gopie_schemas",
    location="backup_snapshot_name"
)

Monitoring

PostgreSQL Monitoring

-- Active connections
SELECT count(*) FROM pg_stat_activity;

-- Long running queries
SELECT pid, now() - pg_stat_activity.query_start AS duration, query 
FROM pg_stat_activity 
WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes';

-- Table sizes
SELECT schemaname, tablename, 
       pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables 
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Index usage
SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;

Prometheus Metrics

# prometheus.yml
scrape_configs:
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
  
  - job_name: 'qdrant'
    static_configs:
      - targets: ['localhost:6333']
    metrics_path: '/metrics'

Troubleshooting

Common PostgreSQL Issues

  1. Connection refused:

    # Check if PostgreSQL is running
    sudo systemctl status postgresql
    
    # Check listening ports
    sudo netstat -plnt | grep 5432
    
    # Check pg_hba.conf for authentication
  2. Too many connections:

    -- Check current connections
    SELECT count(*) FROM pg_stat_activity;
    
    -- Terminate idle connections
    SELECT pg_terminate_backend(pid) 
    FROM pg_stat_activity 
    WHERE state = 'idle' AND state_change < now() - interval '10 minutes';
  3. Slow queries:

    -- Enable query logging
    ALTER SYSTEM SET log_min_duration_statement = '1000';  -- Log queries > 1 second
    SELECT pg_reload_conf();

DuckDB Performance Issues

-- Check memory usage
SELECT * FROM duckdb_memory();

-- Profile query
EXPLAIN ANALYZE SELECT * FROM large_table WHERE condition;

-- Force garbage collection
CALL duckdb_gc();

Qdrant Issues

# Check collection info
info = client.get_collection("gopie_schemas")
print(f"Points: {info.points_count}")
print(f"Indexed: {info.indexed_vectors_count}")
print(f"Status: {info.status}")

# Optimize collection
client.update_collection(
    collection_name="gopie_schemas",
    optimizer_config=OptimizersConfig(
        indexing_threshold=10000
    )
)

Next Steps