From 25b09ebf925bc9a03f3f52014b7d0a8a3693b12e Mon Sep 17 00:00:00 2001 From: Aaron Fenyes Date: Mon, 15 Jul 2024 11:32:04 -0700 Subject: [PATCH] Sketch backtracking Newton's method This code is a mess, but I'm committing it to record a working state before I start trying to clean up. --- engine-proto/gram-test/Engine.jl | 237 +++++++++++++++++- engine-proto/gram-test/circles-in-triangle.jl | 9 +- .../gram-test/overlapping-pyramids.jl | 11 +- .../gram-test/sphere-in-tetrahedron.jl | 5 +- 4 files changed, 254 insertions(+), 8 deletions(-) diff --git a/engine-proto/gram-test/Engine.jl b/engine-proto/gram-test/Engine.jl index e6f0d97..78d1409 100644 --- a/engine-proto/gram-test/Engine.jl +++ b/engine-proto/gram-test/Engine.jl @@ -1,8 +1,10 @@ module Engine using LinearAlgebra +using GenericLinearAlgebra using SparseArrays using Random +using Optim export rand_on_shell, Q, DescentHistory, realize_gram @@ -76,8 +78,11 @@ end struct DescentHistory{T} scaled_loss::Array{T} neg_grad::Array{Matrix{T}} + base_step::Array{Matrix{T}} + hess::Array{Hermitian{T, Matrix{T}}} slope::Array{T} stepsize::Array{T} + used_grad::Array{Bool} backoff_steps::Array{Int64} last_line_L::Array{Matrix{T}} last_line_loss::Array{T} @@ -85,13 +90,16 @@ struct DescentHistory{T} function DescentHistory{T}( scaled_loss = Array{T}(undef, 0), neg_grad = Array{Matrix{T}}(undef, 0), + hess = Array{Hermitian{T, Matrix{T}}}(undef, 0), + base_step = Array{Matrix{T}}(undef, 0), slope = Array{T}(undef, 0), stepsize = Array{T}(undef, 0), + used_grad = Bool[], backoff_steps = Int64[], last_line_L = Array{Matrix{T}}(undef, 0), last_line_loss = Array{T}(undef, 0) ) where T - new(scaled_loss, neg_grad, slope, stepsize, backoff_steps, last_line_L, last_line_loss) + new(scaled_loss, neg_grad, hess, base_step, slope, stepsize, used_grad, backoff_steps, last_line_L, last_line_loss) end end @@ -101,7 +109,7 @@ function realize_gram_gradient( gram::SparseMatrixCSC{T, <:Any}, guess::Matrix{T}; scaled_tol = 1e-30, - target_improvement = 0.5, + min_efficiency = 0.5, init_stepsize = 1.0, backoff = 0.9, max_descent_steps = 600, @@ -152,7 +160,7 @@ function realize_gram_gradient( improvement = loss_last - loss push!(history.last_line_L, L) push!(history.last_line_loss, loss / scale_adjustment) - if improvement >= target_improvement * stepsize * slope + if improvement >= min_efficiency * stepsize * slope history.backoff_steps[end] = backoff_steps break end @@ -201,7 +209,7 @@ function realize_gram_newton( scale_adjustment = sqrt(T(length(constrained))) tol = scale_adjustment * scaled_tol - # use newton's method + # use Newton's method L = copy(guess) for step in 0:max_steps # evaluate the loss function @@ -229,8 +237,10 @@ function realize_gram_newton( deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj) hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim) end + hess = Hermitian(hess) + push!(history.hess, hess) - # compute the newton step + # compute the Newton step step = hess \ reshape(neg_grad, total_dim) L += rate * reshape(step, dims) end @@ -239,4 +249,221 @@ function realize_gram_newton( L, history end +LinearAlgebra.eigen!(A::Symmetric{BigFloat, Matrix{BigFloat}}; sortby::Nothing) = + eigen!(Hermitian(A)) + +function realize_gram_optim( + gram::SparseMatrixCSC{T, <:Any}, + guess::Matrix{T} +) where T <: Number + # find the dimension of the search space + dims = size(guess) + element_dim, construction_dim = dims + total_dim = element_dim * construction_dim + + # list the constrained entries of the gram matrix + J, K, _ = findnz(gram) 
+ constrained = zip(J, K) + + # scale the loss function + scale_adjustment = length(constrained) + + function loss(L_vec) + L = reshape(L_vec, dims) + Δ_proj = proj_diff(gram, L'*Q*L) + dot(Δ_proj, Δ_proj) / scale_adjustment + end + + function loss_grad!(storage, L_vec) + L = reshape(L_vec, dims) + Δ_proj = proj_diff(gram, L'*Q*L) + storage .= reshape(-4*Q*L*Δ_proj, total_dim) / scale_adjustment + end + + function loss_hess!(storage, L_vec) + L = reshape(L_vec, dims) + Δ_proj = proj_diff(gram, L'*Q*L) + indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim] + for (j, k) in indices + basis_mat = basis_matrix(T, j, k, dims) + neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat + neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained) + deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj) / scale_adjustment + storage[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim) + end + end + + optimize( + loss, loss_grad!, loss_hess!, + reshape(guess, total_dim), + NewtonTrustRegion() + ) +end + +# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every +# explicit entry of `gram`. use gradient descent starting from `guess` +function realize_gram( + gram::SparseMatrixCSC{T, <:Any}, + guess::Matrix{T}; + scaled_tol = 1e-30, + min_efficiency = 0.5, + init_rate = 1.0, + backoff = 0.9, + reg_scale = 1.1, + max_descent_steps = 200, + max_backoff_steps = 110 +) where T <: Number + # start history + history = DescentHistory{T}() + + # find the dimension of the search space + dims = size(guess) + element_dim, construction_dim = dims + total_dim = element_dim * construction_dim + + # list the constrained entries of the gram matrix + J, K, _ = findnz(gram) + constrained = zip(J, K) + + # scale the tolerance + scale_adjustment = sqrt(T(length(constrained))) + tol = scale_adjustment * scaled_tol + + # initialize variables + grad_rate = init_rate + L = copy(guess) + + # use Newton's method with backtracking and gradient descent backup + Δ_proj = proj_diff(gram, L'*Q*L) + loss = dot(Δ_proj, Δ_proj) + for step in 1:max_descent_steps + # stop if the loss is tolerably low + if loss < tol + break + end + + # find the negative gradient of loss function + neg_grad = 4*Q*L*Δ_proj + + # find the negative Hessian of the loss function + hess = Matrix{T}(undef, total_dim, total_dim) + indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim] + for (j, k) in indices + basis_mat = basis_matrix(T, j, k, dims) + neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat + neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained) + deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj) + hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim) + end + hess = Hermitian(hess) + push!(history.hess, hess) + + # choose a base step: the Newton step if the Hessian is non-singular, and + # the gradient descent direction otherwise + #= + sing = false + base_step = try + reshape(hess \ reshape(neg_grad, total_dim), dims) + catch ex + if isa(ex, SingularException) + sing = true + normalize(neg_grad) + else + throw(ex) + end + end + =# + #= + if !sing + rate = one(T) + end + =# + #= + if cond(Float64.(hess)) < 1e5 + sing = false + base_step = reshape(hess \ reshape(neg_grad, total_dim), dims) + else + sing = true + base_step = normalize(neg_grad) + end + =# + #= + if cond(Float64.(hess)) > 1e3 + sing = true + hess += big"1e-5"*I + else + sing = false + end + base_step = reshape(hess \ reshape(neg_grad, total_dim), dims) + =# + min_eigval = minimum(eigvals(hess)) + if min_eigval < 0 + hess -= reg_scale * min_eigval * I + end 
+ push!(history.used_grad, false) + base_step = reshape(hess \ reshape(neg_grad, total_dim), dims) + push!(history.base_step, base_step) + #= + push!(history.used_grad, sing) + =# + + # store the current position, loss, and slope + L_last = L + loss_last = loss + push!(history.scaled_loss, loss / scale_adjustment) + push!(history.neg_grad, neg_grad) + push!(history.slope, norm(neg_grad)) + + # find a good step size using backtracking line search + push!(history.stepsize, 0) + push!(history.backoff_steps, max_backoff_steps) + empty!(history.last_line_L) + empty!(history.last_line_loss) + rate = one(T) + for backoff_steps in 0:max_backoff_steps + history.stepsize[end] = rate + + # try Newton step, but not on the first step. doing at least one step of + # gradient descent seems to help prevent getting stuck, for some reason? + if step > 0 + L = L_last + rate * base_step + Δ_proj = proj_diff(gram, L'*Q*L) + loss = dot(Δ_proj, Δ_proj) + improvement = loss_last - loss + push!(history.last_line_L, L) + push!(history.last_line_loss, loss / scale_adjustment) + if improvement >= min_efficiency * rate * dot(neg_grad, base_step) + history.backoff_steps[end] = backoff_steps + break + end + end + + # try gradient descent step + slope = norm(neg_grad) + dir = neg_grad / slope + L = L_last + rate * grad_rate * dir + Δ_proj = proj_diff(gram, L'*Q*L) + loss = dot(Δ_proj, Δ_proj) + improvement = loss_last - loss + if improvement >= min_efficiency * rate * grad_rate * slope + grad_rate *= rate + history.used_grad[end] = true + history.backoff_steps[end] = backoff_steps + break + end + + rate *= backoff + end + + # [DEBUG] if we've hit a wall, quit + if history.backoff_steps[end] == max_backoff_steps + return L_last, history + end + end + + # return the factorization and its history + push!(history.scaled_loss, loss / scale_adjustment) + L, history +end + end \ No newline at end of file diff --git a/engine-proto/gram-test/circles-in-triangle.jl b/engine-proto/gram-test/circles-in-triangle.jl index b031711..2f6caa7 100644 --- a/engine-proto/gram-test/circles-in-triangle.jl +++ b/engine-proto/gram-test/circles-in-triangle.jl @@ -81,16 +81,23 @@ guess = hcat( =# # complete the gram matrix using gradient descent followed by Newton's method +#= L, history = Engine.realize_gram_gradient(gram, guess, scaled_tol = 0.01) L_pol, history_pol = Engine.realize_gram_newton(gram, L, rate = 0.3, scaled_tol = 1e-9) L_pol2, history_pol2 = Engine.realize_gram_newton(gram, L_pol) +=# +L, history = Engine.realize_gram(Float64.(gram), Float64.(guess)) completed_gram = L'*Engine.Q*L println("Completed Gram matrix:\n") display(completed_gram) +#= println( "\nSteps: ", size(history.scaled_loss, 1), " + ", size(history_pol.scaled_loss, 1), " + ", size(history_pol2.scaled_loss, 1) ) -println("Loss: ", history_pol2.scaled_loss[end], "\n") \ No newline at end of file +println("Loss: ", history_pol2.scaled_loss[end], "\n") +=# +println("\nSteps: ", size(history.scaled_loss, 1)) +println("Loss: ", history.scaled_loss[end], "\n") \ No newline at end of file diff --git a/engine-proto/gram-test/overlapping-pyramids.jl b/engine-proto/gram-test/overlapping-pyramids.jl index 51606e1..ae6fb88 100644 --- a/engine-proto/gram-test/overlapping-pyramids.jl +++ b/engine-proto/gram-test/overlapping-pyramids.jl @@ -47,17 +47,25 @@ guess = hcat( Engine.rand_on_shell(fill(BigFloat(-1), 2)) ) -# complete the gram matrix using gradient descent followed by Newton's method +# complete the gram matrix +#= L, history = Engine.realize_gram_gradient(gram, guess, 
scaled_tol = 0.01) L_pol, history_pol = Engine.realize_gram_newton(gram, L) +=# +L, history = Engine.realize_gram(Float64.(gram), Float64.(guess)) completed_gram = L'*Engine.Q*L println("Completed Gram matrix:\n") display(completed_gram) +#= println("\nSteps: ", size(history.scaled_loss, 1), " + ", size(history_pol.scaled_loss, 1)) println("Loss: ", history_pol.scaled_loss[end], "\n") +=# +println("\nSteps: ", size(history.scaled_loss, 1)) +println("Loss: ", history.scaled_loss[end], "\n") # === algebraic check === +#= R, gens = polynomial_ring(AbstractAlgebra.Rationals{BigInt}(), ["x", "t₁", "t₂", "t₃"]) x = gens[1] t = gens[2:4] @@ -85,3 +93,4 @@ x_constraint = 25//16 * to_univariate(S, evaluate(rank_constraints[1], [2], [ind t₂_constraint = 25//16 * to_univariate(S, evaluate(rank_constraints[3], [2], [indep_val])) x_vals = PolynomialRoots.roots(x_constraint.coeffs) t₂_vals = PolynomialRoots.roots(t₂_constraint.coeffs) +=# \ No newline at end of file diff --git a/engine-proto/gram-test/sphere-in-tetrahedron.jl b/engine-proto/gram-test/sphere-in-tetrahedron.jl index dc5588c..6b428cb 100644 --- a/engine-proto/gram-test/sphere-in-tetrahedron.jl +++ b/engine-proto/gram-test/sphere-in-tetrahedron.jl @@ -33,8 +33,11 @@ guess = sqrt(1/BigFloat(3)) * BigFloat[ 1 1 1 1 1 ] + 0.2*Engine.rand_on_shell(fill(BigFloat(-1), 5)) -# complete the gram matrix using Newton's method +# complete the gram matrix +#= L, history = Engine.realize_gram_newton(gram, guess) +=# +L, history = Engine.realize_gram(gram, guess, max_descent_steps = 50) completed_gram = L'*Engine.Q*L println("Completed Gram matrix:\n") display(completed_gram)
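
The step selection that `realize_gram` settles on above — shift the Hessian past its most negative eigenvalue when it is not positive definite, solve for the Newton direction, then backtrack until the actual improvement is at least `min_efficiency` times the improvement predicted by the gradient — boils down to the short self-contained sketch below. The Rosenbrock test problem and the helper names (`regularized_newton_step`, `minimize`) are illustrative assumptions, not part of the commit, and the gradient-descent fallback and `DescentHistory` bookkeeping are omitted.

    using LinearAlgebra

    # illustrative test problem: the Rosenbrock function, its gradient, and its Hessian
    f(x) = (1 - x[1])^2 + 100*(x[2] - x[1]^2)^2
    grad(x) = [-2*(1 - x[1]) - 400*x[1]*(x[2] - x[1]^2), 200*(x[2] - x[1]^2)]
    function hess(x)
      h11 = 2 - 400*x[2] + 1200*x[1]^2
      h12 = -400*x[1]
      [h11 h12; h12 200.0]
    end

    # one step of the scheme sketched in `realize_gram`: regularize the Hessian if it
    # is not positive definite, take the Newton direction, and backtrack until the
    # actual improvement is a decent fraction of the predicted improvement
    function regularized_newton_step(x; reg_scale = 1.1, min_efficiency = 0.5,
                                     backoff = 0.9, max_backoff_steps = 110)
      neg_grad = -grad(x)
      H = Hermitian(hess(x))

      # shift past the most negative eigenvalue, as in the patch's `reg_scale` trick
      min_eigval = minimum(eigvals(H))
      if min_eigval < 0
        H = H - reg_scale * min_eigval * I
      end
      base_step = H \ neg_grad

      # backtracking line search on the step size `rate`
      loss_last = f(x)
      rate = 1.0
      for _ in 0:max_backoff_steps
        x_new = x + rate * base_step
        improvement = loss_last - f(x_new)
        if improvement >= min_efficiency * rate * dot(neg_grad, base_step)
          return x_new
        end
        rate *= backoff
      end
      x   # every step size was rejected; stay put
    end

    # drive a few steps from the usual rough guess
    function minimize(x0; steps = 30)
      x = x0
      for _ in 1:steps
        x = regularized_newton_step(x)
      end
      x
    end

    println(minimize([-1.2, 1.0]))   # should approach the minimizer [1.0, 1.0]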
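
The packaged alternative wired up in `realize_gram_optim` hands the same kind of loss, gradient, and Hessian callbacks to Optim.jl's `NewtonTrustRegion` solver. A minimal usage sketch of that interface, again on the illustrative Rosenbrock problem rather than the Gram-matrix loss:

    using Optim

    f(x) = (1 - x[1])^2 + 100*(x[2] - x[1]^2)^2

    function g!(G, x)
      G[1] = -2*(1 - x[1]) - 400*x[1]*(x[2] - x[1]^2)
      G[2] = 200*(x[2] - x[1]^2)
    end

    function h!(H, x)
      H[1, 1] = 2 - 400*x[2] + 1200*x[1]^2
      H[1, 2] = -400*x[1]
      H[2, 1] = -400*x[1]
      H[2, 2] = 200.0
    end

    result = optimize(f, g!, h!, [-1.2, 1.0], NewtonTrustRegion())
    println(Optim.minimizer(result))   # ≈ [1.0, 1.0]

In the patch itself, `loss`, `loss_grad!`, and `loss_hess!` close over `gram` and the list of constrained entries, and the flattened `guess` vector plays the role of the starting point.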