Database Setup

GoPie uses multiple database systems for different purposes. This guide covers the setup and configuration of each database component.

Database Architecture

Overview

GoPie uses three main database systems:

PostgreSQL - Metadata storage (projects, datasets, users)
DuckDB - Analytical queries and OLAP workloads
Qdrant - Vector storage for semantic search

PostgreSQL Setup

Installation

Docker

# docker-compose.yml
services:
  postgres:
    image: postgres:16-alpine
    environment:
      POSTGRES_USER: gopie
      POSTGRES_PASSWORD: gopie
      POSTGRES_DB: gopie
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U gopie"]
      interval: 10s
      timeout: 5s
      retries: 5

Local Installation

# macOS
brew install postgresql@16
brew services start postgresql@16

# Ubuntu/Debian
sudo apt update
sudo apt install postgresql-16 postgresql-contrib
sudo systemctl start postgresql

# Create database and user
sudo -u postgres psql <<EOF
CREATE USER gopie WITH PASSWORD 'gopie';
CREATE DATABASE gopie OWNER gopie;
GRANT ALL PRIVILEGES ON DATABASE gopie TO gopie;
EOF

Schema Management

GoPie uses Goose for database migrations:

# Run migrations
cd server
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" up

# Check migration status
goose -dir infrastructure/postgres/migrations postgres "$DATABASE_URL" status

# Create new migration
goose -dir infrastructure/postgres/migrations create add_new_feature sql

Database Schema

-- Core tables
CREATE TABLE projects (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_by UUID NOT NULL,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE datasets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    file_path VARCHAR(500),
    file_size BIGINT,
    row_count INTEGER,
    column_count INTEGER,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE dataset_columns (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    dataset_id UUID NOT NULL REFERENCES datasets(id) ON DELETE CASCADE,
    name VARCHAR(255) NOT NULL,
    data_type VARCHAR(50),
    description TEXT,
    is_nullable BOOLEAN DEFAULT true,
    is_unique BOOLEAN DEFAULT false,
    sample_values JSONB,
    statistics JSONB
);

CREATE TABLE chats (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    project_id UUID NOT NULL REFERENCES projects(id) ON DELETE CASCADE,
    user_id UUID NOT NULL,
    title VARCHAR(255),
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE messages (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    chat_id UUID NOT NULL REFERENCES chats(id) ON DELETE CASCADE,
    role VARCHAR(50) NOT NULL,
    content TEXT NOT NULL,
    metadata JSONB,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Indexes for performance
CREATE INDEX idx_datasets_project_id ON datasets(project_id);
CREATE INDEX idx_dataset_columns_dataset_id ON dataset_columns(dataset_id);
CREATE INDEX idx_chats_project_id ON chats(project_id);
CREATE INDEX idx_chats_user_id ON chats(user_id);
CREATE INDEX idx_messages_chat_id ON messages(chat_id);
CREATE INDEX idx_messages_created_at ON messages(created_at);

Connection Pool Configuration

// server/infrastructure/postgres/connection.go
type Config struct {
    DSN                  string
    MaxConnections       int           // Default: 100
    MaxIdleConnections   int           // Default: 10
    ConnectionMaxLifetime time.Duration // Default: 1 hour
    ConnectionMaxIdleTime time.Duration // Default: 10 minutes
}

// Recommended settings by environment
// Development
MaxConnections: 10
MaxIdleConnections: 5

// Production
MaxConnections: 100
MaxIdleConnections: 25

Performance Tuning

-- postgresql.conf optimizations

-- Memory settings
shared_buffers = 256MB          # 25% of system memory
effective_cache_size = 1GB      # 50-75% of system memory
work_mem = 4MB                  # Per operation memory
maintenance_work_mem = 64MB     # For VACUUM, indexes

-- Connection settings
max_connections = 200           # Adjust based on load

-- Write performance
wal_buffers = 16MB
checkpoint_segments = 32
checkpoint_completion_target = 0.9

-- Query planning
random_page_cost = 1.1          # For SSD storage
effective_io_concurrency = 200  # For SSD storage

DuckDB Setup

Installation and Configuration

// server/infrastructure/duckdb/config.go
type DuckDBConfig struct {
    Path         string // File path or ":memory:"
    ReadOnly     bool
    MaxMemory    string // e.g., "4GB"
    Threads      int    // Number of threads
    TempDirectory string
}

// Initialize DuckDB
db, err := sql.Open("duckdb", "?access_mode=read_write&threads=4&memory_limit=4GB")

DuckDB Extensions

-- Install required extensions
INSTALL 'parquet';
INSTALL 'json';
INSTALL 'httpfs';
INSTALL 'postgres';

-- Load extensions
LOAD 'parquet';
LOAD 'json';
LOAD 'httpfs';
LOAD 'postgres';

Data Loading Strategies

-- Load from CSV
CREATE TABLE sales AS 
SELECT * FROM read_csv_auto('s3://bucket/sales.csv', header=true);

-- Load from Parquet
CREATE TABLE events AS 
SELECT * FROM read_parquet('s3://bucket/events/*.parquet');

-- Load from PostgreSQL
ATTACH 'dbname=gopie user=gopie host=localhost' AS pg (TYPE postgres);
CREATE TABLE users AS SELECT * FROM pg.users;

-- Create view for S3 data
CREATE VIEW s3_data AS 
SELECT * FROM read_parquet('s3://bucket/data/*.parquet');

Performance Optimization

-- Create indexes
CREATE INDEX idx_date ON sales(date);
CREATE INDEX idx_customer ON sales(customer_id);

-- Analyze tables for statistics
ANALYZE sales;

-- Optimize memory usage
SET memory_limit = '4GB';
SET threads = 4;
SET preserve_insertion_order = false;

-- Enable parallel execution
SET enable_parallel = true;
SET parallel_threshold = 1000;

Qdrant Setup

Docker Installation

# docker-compose.yml
services:
  qdrant:
    image: qdrant/qdrant:latest
    ports:
      - "6333:6333"
      - "6334:6334"
    volumes:
      - qdrant_data:/qdrant/storage
    environment:
      QDRANT__LOG_LEVEL: INFO
      QDRANT__SERVICE__HTTP_PORT: 6333
      QDRANT__SERVICE__GRPC_PORT: 6334

Collection Configuration

# chat-server/app/services/qdrant_service.py
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    CreateCollection, OptimizersConfig,
    HnswConfig
)

# Initialize client
client = QdrantClient(url="http://localhost:6333")

# Create collection for schema embeddings
client.recreate_collection(
    collection_name="gopie_schemas",
    vectors_config=VectorParams(
        size=1536,  # OpenAI embedding dimension
        distance=Distance.COSINE
    ),
    optimizers_config=OptimizersConfig(
        default_segment_number=4,
        max_segment_size=100000,
        memmap_threshold=50000,
        indexing_threshold=20000,
        flush_interval_sec=10
    ),
    hnsw_config=HnswConfig(
        m=16,
        ef_construct=100,
        full_scan_threshold=10000
    )
)

# Create indexes for metadata filtering
client.create_payload_index(
    collection_name="gopie_schemas",
    field_name="dataset_id",
    field_schema="keyword"
)

client.create_payload_index(
    collection_name="gopie_schemas",
    field_name="column_name",
    field_schema="keyword"
)

Data Ingestion

# Prepare and index schema data
def index_dataset_schema(dataset_id: str, columns: List[Column]):
    points = []
    
    for column in columns:
        # Create embedding for column
        text = f"{column.name} {column.description} {column.data_type}"
        embedding = get_embedding(text)
        
        # Create point with metadata
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "dataset_id": dataset_id,
                "column_name": column.name,
                "data_type": column.data_type,
                "description": column.description,
                "sample_values": column.sample_values
            }
        )
        points.append(point)
    
    # Batch upsert
    client.upsert(
        collection_name="gopie_schemas",
        points=points
    )

Database Connections

Connection Strings

# PostgreSQL
DATABASE_URL=postgres://user:password@host:port/database?sslmode=require

# PostgreSQL with connection pool
DATABASE_URL=postgres://user:password@host:port/database?pool_max_conns=10

# DuckDB file-based
DUCKDB_PATH=/var/lib/gopie/analytics.duckdb

# DuckDB in-memory
DUCKDB_PATH=:memory:

# Qdrant
QDRANT_URL=http://localhost:6333
QDRANT_GRPC_URL=localhost:6334

Health Checks

// PostgreSQL health check
func (db *PostgresDB) HealthCheck(ctx context.Context) error {
    ctx, cancel := context.WithTimeout(ctx, 1*time.Second)
    defer cancel()
    
    return db.PingContext(ctx)
}

// DuckDB health check
func (db *DuckDB) HealthCheck(ctx context.Context) error {
    _, err := db.QueryContext(ctx, "SELECT 1")
    return err
}

# Qdrant health check
async def check_qdrant_health():
    try:
        info = await client.get_collections()
        return {"status": "healthy", "collections": len(info.collections)}
    except Exception as e:
        return {"status": "unhealthy", "error": str(e)}

Backup and Recovery

PostgreSQL Backup

# Full backup
pg_dump -h localhost -U gopie -d gopie -f gopie_backup.sql

# Compressed backup
pg_dump -h localhost -U gopie -d gopie | gzip > gopie_backup.sql.gz

# Backup specific tables
pg_dump -h localhost -U gopie -d gopie -t projects -t datasets > schema_backup.sql

# Restore from backup
psql -h localhost -U gopie -d gopie < gopie_backup.sql

# Automated backup script
#!/bin/bash
BACKUP_DIR="/backups/postgres"
DATE=$(date +%Y%m%d_%H%M%S)
pg_dump -h localhost -U gopie -d gopie | gzip > "$BACKUP_DIR/gopie_$DATE.sql.gz"

# Keep only last 7 days
find $BACKUP_DIR -name "gopie_*.sql.gz" -mtime +7 -delete

DuckDB Backup

-- Export to Parquet
COPY (SELECT * FROM analytics_table) 
TO 's3://backup-bucket/analytics_backup.parquet' 
(FORMAT PARQUET, COMPRESSION ZSTD);

-- Backup entire database
EXPORT DATABASE '/backup/duckdb_backup' (FORMAT PARQUET);

Qdrant Backup

# Create snapshot
snapshot = client.create_snapshot(collection_name="gopie_schemas")

# Download snapshot
import requests
response = requests.get(f"http://localhost:6333/collections/gopie_schemas/snapshots/{snapshot.name}")
with open(f"backup_{snapshot.name}", "wb") as f:
    f.write(response.content)

# Restore from snapshot
client.restore_snapshot(
    collection_name="gopie_schemas",
    location="backup_snapshot_name"
)

Monitoring

PostgreSQL Monitoring

-- Active connections
SELECT count(*) FROM pg_stat_activity;

-- Long running queries
SELECT pid, now() - pg_stat_activity.query_start AS duration, query 
FROM pg_stat_activity 
WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes';

-- Table sizes
SELECT schemaname, tablename, 
       pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables 
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC;

-- Index usage
SELECT schemaname, tablename, indexname, idx_scan, idx_tup_read, idx_tup_fetch
FROM pg_stat_user_indexes
ORDER BY idx_scan DESC;

Prometheus Metrics

# prometheus.yml
scrape_configs:
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
  
  - job_name: 'qdrant'
    static_configs:
      - targets: ['localhost:6333']
    metrics_path: '/metrics'

Troubleshooting

Common PostgreSQL Issues

Connection refused:

# Check if PostgreSQL is running
sudo systemctl status postgresql

# Check listening ports
sudo netstat -plnt | grep 5432

# Check pg_hba.conf for authentication

Too many connections:

-- Check current connections
SELECT count(*) FROM pg_stat_activity;

-- Terminate idle connections
SELECT pg_terminate_backend(pid) 
FROM pg_stat_activity 
WHERE state = 'idle' AND state_change < now() - interval '10 minutes';

Slow queries:

-- Enable query logging
ALTER SYSTEM SET log_min_duration_statement = '1000';  -- Log queries > 1 second
SELECT pg_reload_conf();

DuckDB Performance Issues

-- Check memory usage
SELECT * FROM duckdb_memory();

-- Profile query
EXPLAIN ANALYZE SELECT * FROM large_table WHERE condition;

-- Force garbage collection
CALL duckdb_gc();

Qdrant Issues

# Check collection info
info = client.get_collection("gopie_schemas")
print(f"Points: {info.points_count}")
print(f"Indexed: {info.indexed_vectors_count}")
print(f"Status: {info.status}")

# Optimize collection
client.update_collection(
    collection_name="gopie_schemas",
    optimizer_config=OptimizersConfig(
        indexing_threshold=10000
    )
)

Next Steps

Configure Storage
Set up AI Providers
Deploy with Docker
Implement Monitoring

Database Setup

On this page