Compare commits

...

5 Commits

Author SHA1 Message Date
Aaron Fenyes
b185fd4b83 Switch to backtracking Newton's method in Optim
This performs much better than the trust region Newton's method for the
actual `circles-in-triangle` problem. (The trust region method performs
better for the simplified problem produced by the conversion bug.)
2024-07-15 15:52:38 -07:00
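As a point of reference, here is a minimal, hypothetical sketch (not taken from this repository) contrasting the two Optim.jl flavors the message mentions: Newton's method with a backtracking line search versus trust-region Newton. The Rosenbrock objective and all variable names are placeholders.

using Optim, LineSearches

# stand-in objective: the Rosenbrock function
f(x) = (1 - x[1])^2 + 100 * (x[2] - x[1]^2)^2
x0 = zeros(2)

# Newton's method with a backtracking line search
result_bt = optimize(f, x0, Newton(linesearch = BackTracking()); autodiff = :forward)

# trust-region Newton's method
result_tr = optimize(f, x0, NewtonTrustRegion(); autodiff = :forward)

println(Optim.minimizer(result_bt))
println(Optim.minimizer(result_tr))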
Aaron Fenyes
94e0d321d5 Switch back to BigFloat precision in examples 2024-07-15 14:31:30 -07:00
Aaron Fenyes
53d8c38047 Preserve explicit zeros in Gram matrix conversion
In previous commits, the `circles-in-triangle` example converged much
more slowly in BigFloat precision than in Float64 precision. This
turned out to be a sign of a bug in the Float64 computation: converting
the Gram matrix using `Float64.()` dropped the explicit zeros, removing
many constraints and making the problem much easier to solve. This
commit corrects the Gram matrix conversion. The Float64 search now
solves the same problem as the BigFloat search, with comparable
performance.
2024-07-15 14:08:57 -07:00
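A minimal sketch of the failure mode described above (the toy matrix is made up, not the `circles-in-triangle` data): broadcasting a type conversion over a sparse matrix drops explicitly stored zeros, while converting only the stored entries returned by `findnz` keeps them.

using SparseArrays

# toy Gram matrix with an explicitly stored zero constraint
gram = sparse([1, 1, 2], [1, 2, 2], BigFloat[1, 0, 1])
println(nnz(gram))                             # 3 stored entries, including the zero

# broadcasting the conversion drops the stored zero, and with it the constraint
println(nnz(Float64.(gram)))                   # 2

# converting only the stored values keeps every constrained entry
J, K, values = findnz(gram)
println(nnz(sparse(J, K, Float64.(values))))   # 3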
Aaron Fenyes
7b3efbc385 Clean up backtracking gradient descent code
Drop experimental singularity handling strategies. Reduce the default
tolerance to within 64-bit floating point precision. Report success.
2024-07-15 13:15:15 -07:00
Aaron Fenyes
25b09ebf92 Sketch backtracking Newton's method
This code is a mess, but I'm committing it to record a working state
before I start trying to clean up.
2024-07-15 11:32:04 -07:00
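As a generic illustration of the technique these commits build up (not the repository's code; every name and constant below is illustrative), one damped Newton step with Hessian regularization and a backtracking line search might look like this:

using LinearAlgebra

# one Newton step with backtracking, for a smooth objective f with gradient g and Hessian H
function backtracking_newton_step(f, g, H, x; backoff = 0.9, min_efficiency = 0.5, max_backoff_steps = 110)
    neg_grad = -g(x)
    hess = Symmetric(H(x))
    # shift the spectrum if the Hessian isn't positive definite
    min_eigval = minimum(eigvals(hess))
    if min_eigval <= 0
        hess += (1.1 * abs(min_eigval) + eps()) * I
    end
    base_step = hess \ neg_grad
    loss_last = f(x)
    rate = 1.0
    for _ in 0:max_backoff_steps
        x_new = x + rate * base_step
        # accept once the actual improvement is a decent fraction of the
        # first-order prediction along the step
        if loss_last - f(x_new) >= min_efficiency * rate * dot(neg_grad, base_step)
            return x_new, true
        end
        rate *= backoff
    end
    x, false   # no acceptable step found
end

The `realize_gram` routine added to the engine below follows the same pattern, with the loss, gradient, and Hessian specialized to the Gram matrix problem.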
4 changed files with 220 additions and 10 deletions

View File

@@ -1,10 +1,14 @@
module Engine
using LinearAlgebra
using GenericLinearAlgebra
using SparseArrays
using Random
using Optim
export rand_on_shell, Q, DescentHistory, realize_gram
export
rand_on_shell, Q, DescentHistory,
realize_gram_gradient, realize_gram_newton, realize_gram_optim, realize_gram
# === guessing ===
@@ -76,8 +80,11 @@ end
struct DescentHistory{T}
scaled_loss::Array{T}
neg_grad::Array{Matrix{T}}
base_step::Array{Matrix{T}}
hess::Array{Hermitian{T, Matrix{T}}}
slope::Array{T}
stepsize::Array{T}
positive::Array{Bool}
backoff_steps::Array{Int64}
last_line_L::Array{Matrix{T}}
last_line_loss::Array{T}
@@ -85,13 +92,16 @@ struct DescentHistory{T}
function DescentHistory{T}(
scaled_loss = Array{T}(undef, 0),
neg_grad = Array{Matrix{T}}(undef, 0),
base_step = Array{Matrix{T}}(undef, 0),
hess = Array{Hermitian{T, Matrix{T}}}(undef, 0),
slope = Array{T}(undef, 0),
stepsize = Array{T}(undef, 0),
positive = Bool[],
backoff_steps = Int64[],
last_line_L = Array{Matrix{T}}(undef, 0),
last_line_loss = Array{T}(undef, 0)
) where T
new(scaled_loss, neg_grad, slope, stepsize, backoff_steps, last_line_L, last_line_loss)
new(scaled_loss, neg_grad, base_step, hess, slope, stepsize, positive, backoff_steps, last_line_L, last_line_loss)
end
end
@@ -101,7 +111,7 @@ function realize_gram_gradient(
gram::SparseMatrixCSC{T, <:Any},
guess::Matrix{T};
scaled_tol = 1e-30,
target_improvement = 0.5,
min_efficiency = 0.5,
init_stepsize = 1.0,
backoff = 0.9,
max_descent_steps = 600,
@@ -152,7 +162,7 @@ function realize_gram_gradient(
improvement = loss_last - loss
push!(history.last_line_L, L)
push!(history.last_line_loss, loss / scale_adjustment)
if improvement >= target_improvement * stepsize * slope
if improvement >= min_efficiency * stepsize * slope
history.backoff_steps[end] = backoff_steps
break
end
@@ -201,7 +211,7 @@ function realize_gram_newton(
scale_adjustment = sqrt(T(length(constrained)))
tol = scale_adjustment * scaled_tol
# use newton's method
# use Newton's method
L = copy(guess)
for step in 0:max_steps
# evaluate the loss function
@@ -229,8 +239,10 @@ function realize_gram_newton(
deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj)
hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
end
hess = Hermitian(hess)
push!(history.hess, hess)
# compute the newton step
# compute the Newton step
step = hess \ reshape(neg_grad, total_dim)
L += rate * reshape(step, dims)
end
@@ -239,4 +251,168 @@ function realize_gram_newton(
L, history
end
# make `eigen!` work on symmetric BigFloat matrices by dispatching to the
# Hermitian method, which GenericLinearAlgebra provides
LinearAlgebra.eigen!(A::Symmetric{BigFloat, Matrix{BigFloat}}; sortby::Nothing) =
eigen!(Hermitian(A))
# convert the stored entries of a sparse matrix to the given type, keeping
# explicitly stored zeros
function convertnz(type, mat)
J, K, values = findnz(mat)
sparse(J, K, type.(values))
end
function realize_gram_optim(
gram::SparseMatrixCSC{T, <:Any},
guess::Matrix{T}
) where T <: Number
# find the dimension of the search space
dims = size(guess)
element_dim, construction_dim = dims
total_dim = element_dim * construction_dim
# list the constrained entries of the gram matrix
J, K, _ = findnz(gram)
constrained = zip(J, K)
# scale the loss function
scale_adjustment = length(constrained)
function loss(L_vec)
L = reshape(L_vec, dims)
Δ_proj = proj_diff(gram, L'*Q*L)
dot(Δ_proj, Δ_proj) / scale_adjustment
end
function loss_grad!(storage, L_vec)
L = reshape(L_vec, dims)
Δ_proj = proj_diff(gram, L'*Q*L)
storage .= reshape(-4*Q*L*Δ_proj, total_dim) / scale_adjustment
end
function loss_hess!(storage, L_vec)
L = reshape(L_vec, dims)
Δ_proj = proj_diff(gram, L'*Q*L)
indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim]
for (j, k) in indices
basis_mat = basis_matrix(T, j, k, dims)
neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat
neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained)
deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj) / scale_adjustment
storage[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
end
end
optimize(
loss, loss_grad!, loss_hess!,
reshape(guess, total_dim),
Newton()
)
end
# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every
# explicit entry of `gram`. use Newton's method with backtracking, starting from `guess`
function realize_gram(
gram::SparseMatrixCSC{T, <:Any},
guess::Matrix{T};
scaled_tol = 1e-30,
min_efficiency = 0.5,
init_rate = 1.0,
backoff = 0.9,
reg_scale = 1.1,
max_descent_steps = 200,
max_backoff_steps = 110
) where T <: Number
# start history
history = DescentHistory{T}()
# find the dimension of the search space
dims = size(guess)
element_dim, construction_dim = dims
total_dim = element_dim * construction_dim
# list the constrained entries of the gram matrix
J, K, _ = findnz(gram)
constrained = zip(J, K)
# scale the tolerance
scale_adjustment = sqrt(T(length(constrained)))
tol = scale_adjustment * scaled_tol
# initialize variables
grad_rate = init_rate
L = copy(guess)
# use Newton's method with a backtracking line search, regularizing the Hessian
# when it isn't positive definite
Δ_proj = proj_diff(gram, L'*Q*L)
loss = dot(Δ_proj, Δ_proj)
for step in 1:max_descent_steps
# stop if the loss is tolerably low
if loss < tol
break
end
# find the negative gradient of loss function
neg_grad = 4*Q*L*Δ_proj
# find the negative Hessian of the loss function
hess = Matrix{T}(undef, total_dim, total_dim)
indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim]
for (j, k) in indices
basis_mat = basis_matrix(T, j, k, dims)
neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat
neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained)
deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj)
hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
end
hess = Hermitian(hess)
push!(history.hess, hess)
# regularize the Hessian
min_eigval = minimum(eigvals(hess))
push!(history.positive, min_eigval > 0)
if min_eigval <= 0
hess -= reg_scale * min_eigval * I
end
base_step = reshape(hess \ reshape(neg_grad, total_dim), dims)
push!(history.base_step, base_step)
# store the current position, loss, and slope
L_last = L
loss_last = loss
push!(history.scaled_loss, loss / scale_adjustment)
push!(history.neg_grad, neg_grad)
push!(history.slope, norm(neg_grad))
# find a good step size using backtracking line search
push!(history.stepsize, 0)
push!(history.backoff_steps, max_backoff_steps)
empty!(history.last_line_L)
empty!(history.last_line_loss)
rate = one(T)
step_success = false
for backoff_steps in 0:max_backoff_steps
history.stepsize[end] = rate
L = L_last + rate * base_step
Δ_proj = proj_diff(gram, L'*Q*L)
loss = dot(Δ_proj, Δ_proj)
improvement = loss_last - loss
push!(history.last_line_L, L)
push!(history.last_line_loss, loss / scale_adjustment)
if improvement >= min_efficiency * rate * dot(neg_grad, base_step)
history.backoff_steps[end] = backoff_steps
step_success = true
break
end
rate *= backoff
end
# if we've hit a wall, quit
if !step_success
return L_last, false, history
end
end
# return the factorization and its history
push!(history.scaled_loss, loss / scale_adjustment)
L, true, history
end
end

View File

@@ -81,16 +81,28 @@ guess = hcat(
=#
# complete the gram matrix using gradient descent followed by Newton's method
#=
L, history = Engine.realize_gram_gradient(gram, guess, scaled_tol = 0.01)
L_pol, history_pol = Engine.realize_gram_newton(gram, L, rate = 0.3, scaled_tol = 1e-9)
L_pol2, history_pol2 = Engine.realize_gram_newton(gram, L_pol)
=#
L, success, history = Engine.realize_gram(gram, guess)
completed_gram = L'*Engine.Q*L
println("Completed Gram matrix:\n")
display(completed_gram)
#=
println(
"\nSteps: ",
size(history.scaled_loss, 1),
" + ", size(history_pol.scaled_loss, 1),
" + ", size(history_pol2.scaled_loss, 1)
)
println("Loss: ", history_pol2.scaled_loss[end], "\n")
println("Loss: ", history_pol2.scaled_loss[end], "\n")
=#
if success
println("\nTarget accuracy achieved!")
else
println("\nFailed to reach target accuracy")
end
println("Steps: ", size(history.scaled_loss, 1))
println("Loss: ", history.scaled_loss[end], "\n")

View File

@@ -47,17 +47,30 @@ guess = hcat(
Engine.rand_on_shell(fill(BigFloat(-1), 2))
)
# complete the gram matrix using gradient descent followed by Newton's method
# complete the gram matrix
#=
L, history = Engine.realize_gram_gradient(gram, guess, scaled_tol = 0.01)
L_pol, history_pol = Engine.realize_gram_newton(gram, L)
=#
L, success, history = Engine.realize_gram(gram, guess)
completed_gram = L'*Engine.Q*L
println("Completed Gram matrix:\n")
display(completed_gram)
#=
println("\nSteps: ", size(history.scaled_loss, 1), " + ", size(history_pol.scaled_loss, 1))
println("Loss: ", history_pol.scaled_loss[end], "\n")
=#
if success
println("\nTarget accuracy achieved!")
else
println("\nFailed to reach target accuracy")
end
println("Steps: ", size(history.scaled_loss, 1))
println("Loss: ", history.scaled_loss[end], "\n")
# === algebraic check ===
#=
R, gens = polynomial_ring(AbstractAlgebra.Rationals{BigInt}(), ["x", "t₁", "t₂", "t₃"])
x = gens[1]
t = gens[2:4]
@@ -85,3 +98,4 @@ x_constraint = 25//16 * to_univariate(S, evaluate(rank_constraints[1], [2], [ind
t₂_constraint = 25//16 * to_univariate(S, evaluate(rank_constraints[3], [2], [indep_val]))
x_vals = PolynomialRoots.roots(x_constraint.coeffs)
t₂_vals = PolynomialRoots.roots(t₂_constraint.coeffs)
=#

View File

@@ -33,10 +33,18 @@ guess = sqrt(1/BigFloat(3)) * BigFloat[
1 1 1 1 1
] + 0.2*Engine.rand_on_shell(fill(BigFloat(-1), 5))
# complete the gram matrix using Newton's method
# complete the gram matrix
#=
L, history = Engine.realize_gram_newton(gram, guess)
=#
L, success, history = Engine.realize_gram(gram, guess)
completed_gram = L'*Engine.Q*L
println("Completed Gram matrix:\n")
display(completed_gram)
println("\nSteps: ", size(history.scaled_loss, 1))
if success
println("\nTarget accuracy achieved!")
else
println("\nFailed to reach target accuracy")
end
println("Steps: ", size(history.scaled_loss, 1))
println("Loss: ", history.scaled_loss[end], "\n")