Complex Join Patterns Guide

Copy Markdown

This guide provides comprehensive examples and patterns for implementing complex joins using Selecto's advanced join infrastructure.

Table of Contents

Star Schema Dimensions

Star schema dimensions are optimized for analytical queries, providing denormalized access to dimension data with automatic display field handling.

Basic Star Dimension Setup

# Domain configuration for star schema
star_domain = %{
  name: "Analytics Star Schema",
  source: %{
    source_table: "sales_facts",
    primary_key: :id,
    fields: [:id, :sale_amount, :customer_id, :product_id, :date_id],
    columns: %{
      id: %{type: :integer},
      sale_amount: %{type: :decimal},
      customer_id: %{type: :integer},
      product_id: %{type: :integer},
      date_id: %{type: :integer}
    },
    associations: %{
      customer: %{queryable: :customers, field: :customer, owner_key: :customer_id, related_key: :id},
      product: %{queryable: :products, field: :product, owner_key: :product_id, related_key: :id},
      date: %{queryable: :dates, field: :date, owner_key: :date_id, related_key: :id}
    }
  },
  schemas: %{
    customers: %{
      name: "Customer Dimension",
      source_table: "customer_dim",
      fields: [:id, :name, :segment, :region, :country],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        segment: %{type: :string},
        region: %{type: :string},
        country: %{type: :string}
      }
    },
    products: %{
      name: "Product Dimension",
      source_table: "product_dim",
      fields: [:id, :name, :category, :subcategory, :brand],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        category: %{type: :string},
        subcategory: %{type: :string},
        brand: %{type: :string}
      }
    },
    dates: %{
      name: "Date Dimension",
      source_table: "date_dim",
      fields: [:id, :date_value, :year, :quarter, :month, :day_name],
      columns: %{
        id: %{type: :integer},
        date_value: %{type: :date},
        year: %{type: :integer},
        quarter: %{type: :integer},
        month: %{type: :integer},
        day_name: %{type: :string}
      }
    }
  },
  joins: %{
    # Star dimensions with automatic display field resolution
    customer: %{type: :star_dimension, display_field: :name},
    product: %{type: :star_dimension, display_field: :name},
    date: %{type: :star_dimension, display_field: :date_value}
  }
}

# Configure and query
selecto = Selecto.configure(star_domain, conn)

# Query with star dimension fields
sales_by_region = selecto
  |> Selecto.select([
    "customer_display",          # Automatic display field (customer.name)
    "customer.segment",         # Dimension attribute
    "customer.region",          # Dimension attribute
    "product_display",           # Automatic display field (product.name)
    "product.category",         # Dimension attribute
    "date.year",               # Time dimension
    {:func, "sum", ["sale_amount"]},
    {:func, "count", ["*"]}
  ])
  |> Selecto.filter([
    {"date.year", 2024},
    {"customer.segment", {:in, ["Premium", "Enterprise"]}},
    {"product.category", {:not_eq, "Discontinued"}}
  ])
  |> Selecto.group_by([
    "customer.region",
    "product.category", 
    "date.year"
  ])
  |> Selecto.execute()

Nested Star Dimensions

# Extended star schema with snowflake dimension normalization
nested_star_domain = %{
  # ... base configuration ...
  schemas: %{
    # ... existing schemas ...
    regions: %{
      name: "Region Dimension",
      source_table: "region_dim",
      fields: [:id, :name, :country_id],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        country_id: %{type: :integer}
      },
      associations: %{
        country: %{queryable: :countries, field: :country, owner_key: :country_id, related_key: :id}
      }
    },
    countries: %{
      name: "Country Dimension",
      source_table: "country_dim", 
      fields: [:id, :name, :continent],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        continent: %{type: :string}
      }
    }
  },
  joins: %{
    customer: %{
      type: :star_dimension,
      display_field: :name,
      joins: %{
        # Snowflake pattern: dimension joins to other dimensions
        region: %{
          type: :star_dimension,
          display_field: :name,
          joins: %{
            country: %{type: :star_dimension, display_field: :name}
          }
        }
      }
    }
  }
}

# Query with nested star dimensions
geographic_analysis = selecto
  |> Selecto.select([
    "customer.region.country_display",  # Nested dimension display
    "customer.region.country.continent", # Deep dimension attribute
    {:func, "sum", ["sale_amount"]},
    {:func, "count", ["DISTINCT", "customer_id"]}
  ])
  |> Selecto.filter([
    {"customer.region.country.continent", {:in, ["North America", "Europe"]}},
    {"sale_amount", {:gt, 1000}}
  ])
  |> Selecto.group_by([
    "customer.region.country_display",
    "customer.region.country.continent"
  ])
  |> Selecto.execute()

Hierarchical Joins

Hierarchical joins handle tree-like data structures using either adjacency lists or materialized paths, with automatic CTE generation for traversal.

Adjacency List Hierarchies

# Domain with adjacency list hierarchy
hierarchy_domain = %{
  name: "Organizational Hierarchy",
  source: %{
    source_table: "employees",
    primary_key: :id,
    fields: [:id, :name, :position, :department_id, :manager_id],
    columns: %{
      id: %{type: :integer},
      name: %{type: :string}, 
      position: %{type: :string},
      department_id: %{type: :integer},
      manager_id: %{type: :integer}
    },
    associations: %{
      manager: %{queryable: :employees, field: :manager, owner_key: :manager_id, related_key: :id},
      department: %{queryable: :departments, field: :department, owner_key: :department_id, related_key: :id}
    }
  },
  schemas: %{
    departments: %{
      name: "Department",
      source_table: "departments",
      fields: [:id, :name, :parent_id],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        parent_id: %{type: :integer}
      }
    }
  },
  joins: %{
    # Self-referencing hierarchy (management chain)
    manager: %{
      type: :hierarchical,
      hierarchy_type: :adjacency_list,
      depth_limit: 6,
      path_separator: " > "
    },
    # Department hierarchy  
    department: %{
      type: :hierarchical,
      hierarchy_type: :adjacency_list,
      depth_limit: 4
    }
  }
}

selecto = Selecto.configure(hierarchy_domain, conn)

# Query management hierarchy
management_analysis = selecto
  |> Selecto.select([
    "name",
    "position",
    "manager_path",           # Full path to CEO: "CEO > VP > Director > Manager"
    "manager_level",          # Depth in hierarchy (0 = CEO, 1 = VP, etc.)
    "manager_path_array",     # Array of manager IDs in path
    "department.name",
    "department_path",        # Department hierarchy path
    "department_level"        # Department depth
  ])
  |> Selecto.filter([
    {"manager_level", {:between, 2, 4}},      # Middle management layers
    {"department_level", {:lte, 2}},          # Top 2 department levels
    {"position", {:like, "%Manager%"}}
  ])
  |> Selecto.order_by([
    "department_level",
    "manager_level", 
    "name"
  ])
  |> Selecto.execute()

Materialized Path Hierarchies

# Domain with materialized path hierarchy (more efficient for deep trees)
path_hierarchy_domain = %{
  name: "Content Categories",
  source: %{
    source_table: "articles",
    primary_key: :id,
    fields: [:id, :title, :content, :category_id],
    columns: %{
      id: %{type: :integer},
      title: %{type: :string},
      content: %{type: :text},
      category_id: %{type: :integer}
    },
    associations: %{
      category: %{queryable: :categories, field: :category, owner_key: :category_id, related_key: :id}
    }
  },
  schemas: %{
    categories: %{
      name: "Category",
      source_table: "categories",
      fields: [:id, :name, :path, :parent_id],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string}, 
        path: %{type: :string},    # e.g., "/technology/web-dev/frontend"
        parent_id: %{type: :integer}
      }
    }
  },
  joins: %{
    category: %{
      type: :hierarchical,
      hierarchy_type: :materialized_path,
      path_field: :path,
      path_separator: "/",
      depth_limit: 8
    }
  }
}

selecto = Selecto.configure(path_hierarchy_domain, conn)

# Query with path-based hierarchy
category_analysis = selecto
  |> Selecto.select([
    "title",
    "category.name",
    "category_path",          # Full category path
    "category_level",         # Depth (calculated from path)
    "category_ancestors",     # Array of ancestor category names
    {:func, "count", ["*"]}
  ])
  |> Selecto.filter([
    # Find all articles in "Technology" branch (any level)
    {"category_path", {:like, "%/technology/%"}},
    # Or specific depth
    {"category_level", {:eq, 3}}
  ])
  |> Selecto.group_by([
    "category.name", 
    "category_path",
    "category_level"
  ])
  |> Selecto.execute()

Many-to-Many Tagging

Many-to-many tagging joins provide faceted filtering and tag aggregation for flexible categorization systems.

Basic Tagging Setup

# Domain with many-to-many tagging
tagging_domain = %{
  name: "Content Tagging System",
  source: %{
    source_table: "articles",
    primary_key: :id,
    fields: [:id, :title, :content, :author_id, :published_at],
    columns: %{
      id: %{type: :integer},
      title: %{type: :string},
      content: %{type: :text}, 
      author_id: %{type: :integer},
      published_at: %{type: :utc_datetime}
    },
    associations: %{
      author: %{queryable: :users, field: :author, owner_key: :author_id, related_key: :id},
      tags: %{queryable: :article_tags, field: :tags, owner_key: :id, related_key: :article_id}
    }
  },
  schemas: %{
    users: %{
      name: "User",
      source_table: "users",
      fields: [:id, :name, :email],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        email: %{type: :string}
      }
    },
    article_tags: %{
      name: "Article Tag",
      source_table: "article_tags",
      fields: [:id, :name, :article_id, :weight],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        article_id: %{type: :integer},
        weight: %{type: :integer}  # Tag relevance weight
      }
    }
  },
  joins: %{
    author: %{type: :left, display_field: :name},
    tags: %{
      type: :tagging,
      tag_field: :name,
      weight_field: :weight,     # Optional: for weighted tagging
      aggregation: :string_agg   # How to combine multiple tags
    }
  }
}

selecto = Selecto.configure(tagging_domain, conn)

# Query with tag filtering and aggregation
tagged_content = selecto
  |> Selecto.select([
    "title",
    "author_display",
    "tags_list",              # Aggregated tag string: "tech,web,frontend" 
    "tags_count",             # Number of tags
    "tags_weight_sum",        # Sum of tag weights
    "published_at"
  ])
  |> Selecto.filter([
    # Faceted tag filtering
    {"tags_filter", "technology"},        # Has "technology" tag
    {"tags_any", ["web", "mobile"]},      # Has any of these tags
    {"tags_all", ["frontend", "react"]},  # Has all of these tags
    {"tags_weight_min", 5},               # Minimum tag weight
    {"published_at", {:gte, ~D[2024-01-01]}}
  ])
  |> Selecto.order_by([
    {:desc, "tags_weight_sum"},  # Order by tag relevance
    {:desc, "published_at"}
  ])
  |> Selecto.execute()

Advanced Tagging Patterns

# Multi-dimensional tagging (categories + user tags)
multi_tag_domain = %{
  # ... base configuration ...
  associations: %{
    # ... existing associations ...
    user_tags: %{queryable: :user_article_tags, field: :user_tags, owner_key: :id, related_key: :article_id},
    system_tags: %{queryable: :system_tags, field: :system_tags, owner_key: :id, related_key: :article_id}
  },
  schemas: %{
    # ... existing schemas ...
    user_article_tags: %{
      name: "User Tag",
      source_table: "user_article_tags", 
      fields: [:id, :name, :article_id, :user_id, :created_at],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        article_id: %{type: :integer},
        user_id: %{type: :integer},
        created_at: %{type: :utc_datetime}
      }
    },
    system_tags: %{
      name: "System Tag",
      source_table: "system_tags",
      fields: [:id, :name, :article_id, :confidence],
      columns: %{
        id: %{type: :integer},
        name: %{type: :string},
        article_id: %{type: :integer},
        confidence: %{type: :float}  # AI-generated tag confidence
      }
    }
  },
  joins: %{
    # ... existing joins ...
    user_tags: %{
      type: :tagging,
      tag_field: :name,
      aggregation: :array_agg    # Keep as array for complex filtering
    },
    system_tags: %{
      type: :tagging,
      tag_field: :name,
      weight_field: :confidence,
      min_weight: 0.7           # Only high-confidence system tags
    }
  }
}

# Query with multiple tag dimensions
multi_dimensional_tags = selecto
  |> Selecto.select([
    "title",
    "user_tags_array",        # User-generated tags as array
    "system_tags_list",       # AI tags as string (high confidence only)
    "system_tags_avg_confidence", # Average AI confidence
    {:func, "count", ["DISTINCT", "user_tags.user_id"]} # Unique taggers
  ])
  |> Selecto.filter([
    {"user_tags_count", {:gte, 3}},           # At least 3 user tags
    {"system_tags_filter", "programming"},    # AI detected "programming"
    {"system_tags_avg_confidence", {:gte, 0.8}} # High AI confidence
  ])
  |> Selecto.group_by(["id", "title"])
  |> Selecto.execute()

Self-Referencing Hierarchies

Self-referencing hierarchies handle cases where entities reference other entities of the same type, such as management structures or threaded comments.

Comment Threading

# Domain for threaded comments
comment_domain = %{
  name: "Threaded Comments",
  source: %{
    source_table: "comments",
    primary_key: :id,
    fields: [:id, :content, :author_id, :parent_id, :article_id, :created_at],
    columns: %{
      id: %{type: :integer},
      content: %{type: :text},
      author_id: %{type: :integer},
      parent_id: %{type: :integer},    # Self-reference
      article_id: %{type: :integer},
      created_at: %{type: :utc_datetime}
    },
    associations: %{
      parent: %{queryable: :comments, field: :parent, owner_key: :parent_id, related_key: :id},
      author: %{queryable: :users, field: :author, owner_key: :author_id, related_key: :id},
      article: %{queryable: :articles, field: :article, owner_key: :article_id, related_key: :id}
    }
  },
  # ... schemas for users, articles ...
  joins: %{
    parent: %{
      type: :hierarchical,
      hierarchy_type: :adjacency_list,
      depth_limit: 10,  # Limit thread depth
      self_reference: true
    },
    author: %{type: :left, display_field: :name},
    article: %{type: :left, display_field: :title}
  }
}

selecto = Selecto.configure(comment_domain, conn)

# Query comment threads with hierarchy
comment_threads = selecto
  |> Selecto.select([
    "content",
    "author_display",
    "parent_path",            # Path to root comment
    "parent_level",           # Thread depth (0 = root, 1 = reply, etc.)
    "thread_root_id",         # ID of root comment in thread
    "created_at"
  ])
  |> Selecto.filter([
    {"article_id", {:eq, 123}},
    {"parent_level", {:lte, 5}}  # Limit thread depth for display
  ])
  |> Selecto.order_by([
    "thread_root_id",         # Group by thread
    "parent_level",           # Order by depth
    "created_at"              # Then by time
  ])
  |> Selecto.execute()

Organizational Reporting

# Query direct reports and team sizes
org_structure = selecto
  |> Selecto.select([
    "name",
    "position", 
    "manager_display",        # Direct manager name
    "manager_level",          # Levels from CEO
    {:func, "count", ["subordinates.id"]}, # Direct reports count
    "team_size_total",        # Total team size (all levels down)
    "manager_path"            # Full reporting chain to CEO
  ])
  |> Selecto.filter([
    {"manager_level", {:between, 1, 4}},  # Skip CEO, limit depth
    {"active", true}
  ])
  |> Selecto.group_by([
    "id", "name", "position", 
    "manager_display", "manager_level"
  ])
  |> Selecto.order_by([
    "manager_level",
    {:desc, {:func, "count", ["subordinates.id"]}}
  ])
  |> Selecto.execute()

Mixed Join Patterns

Complex domains often combine multiple join patterns for comprehensive data relationships.

E-commerce with Mixed Patterns

# Domain combining star dimensions, hierarchies, and tagging
ecommerce_mixed = %{
  name: "E-commerce Analytics",
  source: %{
    source_table: "orders",
    primary_key: :id,
    fields: [:id, :total, :customer_id, :created_at, :status],
    # ... columns ...
    associations: %{
      customer: %{queryable: :customers, field: :customer, owner_key: :customer_id, related_key: :id},
      items: %{queryable: :order_items, field: :items, owner_key: :id, related_key: :order_id}
    }
  },
  schemas: %{
    customers: %{
      # ... customer schema with region association ...
    },
    order_items: %{
      # ... order item schema with product association ...
    },
    products: %{
      # ... product schema with category and tags associations ...
    },
    categories: %{
      # ... hierarchical categories with path field ...
    }
  },
  joins: %{
    # Star dimension for customer analytics
    customer: %{
      type: :star_dimension,
      display_field: :name,
      joins: %{
        region: %{type: :star_dimension, display_field: :name}
      }
    },
    # Left join to order items with nested patterns
    items: %{
      type: :left,
      joins: %{
        # Star dimension for product analytics
        product: %{
          type: :star_dimension,
          display_field: :name,
          joins: %{
            # Hierarchical category structure
            category: %{
              type: :hierarchical,
              hierarchy_type: :materialized_path,
              path_field: :path,
              path_separator: "/"
            },
            # Many-to-many product tags
            tags: %{type: :tagging, tag_field: :name}
          }
        }
      }
    }
  }
}

selecto = Selecto.configure(ecommerce_mixed, conn)

# Complex query using all join patterns
comprehensive_analysis = selecto
  |> Selecto.select([
    # Star dimension fields
    "customer_display",
    "customer.region_display",
    
    # Hierarchical category analysis
    "items.product.category_path",
    "items.product.category_level",
    
    # Tagging analysis
    "items.product.tags_list",
    "items.product.tags_count",
    
    # Aggregated measures
    {:func, "sum", ["total"]},
    {:func, "avg", ["items.quantity"]},
    {:func, "count", ["DISTINCT", "items.product_id"]}
  ])
  |> Selecto.filter([
    # Star dimension filters
    {"customer.region.name", {:in, ["North America", "Europe"]}},
    
    # Hierarchical filters
    {"items.product.category_level", {:lte, 3}},
    {"items.product.category_path", {:like, "%electronics%"}},
    
    # Tagging filters
    {"items.product.tags_filter", "premium"},
    {"items.product.tags_count", {:gte, 2}},
    
    # Standard filters
    {"status", "completed"},
    {"created_at", {:gte, ~D[2024-01-01]}}
  ])
  |> Selecto.group_by([
    "customer.region_display",
    "items.product.category_path",
    "items.product.category_level"
  ])
  |> Selecto.order_by([
    {:desc, {:func, "sum", ["total"]}},
    "items.product.category_level"
  ])
  |> Selecto.execute()

Performance Considerations

Join Ordering Optimization

# Optimal join order for performance
def optimize_join_order(selecto) do
  selecto
  # 1. Apply most selective filters first (reduces dataset early)
  |> Selecto.filter([
    {"created_at", {:between, ~D[2024-01-01], ~D[2024-03-31]}}, # Date range
    {"status", "active"},                                        # Selective filter
    {"customer.segment", "enterprise"}                          # Dimension filter
  ])
  # 2. Select specific fields (avoid SELECT *)
  |> Selecto.select([
    "customer_display",
    "items.product.category_path",
    {:func, "sum", ["total"]}
  ])
  # 3. Group by dimension fields only
  |> Selecto.group_by(["customer_display", "items.product.category_path"])
  # 4. Order by aggregated values for efficient top-N
  |> Selecto.order_by([{:desc, {:func, "sum", ["total"]}}])
  |> Selecto.limit(100)
end

Index Recommendations

-- Recommended indexes for join patterns

-- Star dimension joins
CREATE INDEX idx_orders_customer_date ON orders(customer_id, created_at);
CREATE INDEX idx_customers_segment_region ON customers(segment, region_id);

-- Hierarchical joins
CREATE INDEX idx_categories_parent_path ON categories(parent_id, path);
CREATE INDEX idx_employees_manager_level ON employees(manager_id) WHERE active = true;

-- Tagging joins  
CREATE INDEX idx_product_tags_product_name ON product_tags(product_id, name);
CREATE INDEX idx_article_tags_name_weight ON article_tags(name, weight) WHERE weight >= 0.5;

-- Composite indexes for common filter combinations
CREATE INDEX idx_orders_status_date_customer ON orders(status, created_at, customer_id);

Advanced CTE Integration

Custom CTE with Join Patterns

alias Selecto.Advanced.CTE
alias Selecto.Builder.CteSql

# Build custom CTE that leverages join patterns
def build_customer_segment_analysis(conn) do
  # Base selecto with join patterns
  base_selecto = Selecto.configure(ecommerce_mixed, conn)
    |> Selecto.select([
      "customer_id",
      "customer.segment", 
      "customer.region_display",
      "items.product.category_level",
      {:func, "sum", ["total"]},
      {:func, "count", ["*"]}
    ])
    |> Selecto.filter([
      {"created_at", {:gte, ~D[2024-01-01]}},
      {"status", "completed"}
    ])
    |> Selecto.group_by([
      "customer_id",
      "customer.segment",
      "customer.region_display", 
      "items.product.category_level"
    ])
  
  # Convert to CTE spec
  customer_cte = CTE.create_cte("customer_analysis", fn -> base_selecto end)
  
  # Main query using the CTE
  main_query = [
    "SELECT ",
      "customer_segment, region_display, category_level, ",
      "SUM(total_sales) as segment_sales, ",
      "AVG(order_count) as avg_orders_per_customer, ",
      "COUNT(DISTINCT customer_id) as customer_count ",
    "FROM customer_analysis ",
    "GROUP BY customer_segment, region_display, category_level ",
    "ORDER BY segment_sales DESC"
  ]
  
  {complete_query, _combined_params} = CteSql.integrate_ctes_with_query(
    [customer_cte],
    main_query,
    []
  )
  
  {sql, final_params} = Selecto.SQL.Params.finalize(complete_query)
  Postgrex.query(conn, sql, final_params)
end

This comprehensive guide demonstrates how to effectively use Selecto's complex join patterns for real-world data analysis scenarios. Each pattern is optimized for specific use cases and can be combined to handle sophisticated data relationships.