forked from glen/dyna3

In previous commits, the `circles-in-triangle` example converged much more slowly in BigFloat precision than in Float64 precision. This turned out to be a sign of a bug in the Float64 computation: converting the Gram matrix using `Float64.()` dropped the explicit zeros, removing many constraints and making the problem much easier to solve. This commit corrects the Gram matrix conversion. The Float64 search now solves the same problem as the BigFloat search, with comparable performance.
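For illustration (a made-up two-element Gram matrix with one constraint stored as an explicit zero), the `convertnz` helper defined below converts the stored entries without touching the sparsity pattern, whereas the plain broadcast conversion dropped the stored zeros:

    gram = sparse([1, 2, 1], [1, 2, 2], BigFloat[1, 1, 0])  # the (1, 2) constraint is a stored zero
    nnz(Float64.(gram))            # the broadcast conversion can drop the stored zero, losing a constraint
    nnz(convertnz(Float64, gram))  # convertnz keeps all three stored entries, zeros included
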
418 lines · No EOL · 12 KiB · Julia
module Engine

using LinearAlgebra
using GenericLinearAlgebra
using SparseArrays
using Random
using Optim

export
  rand_on_shell, Q, DescentHistory,
  realize_gram_gradient, realize_gram_newton, realize_gram_optim, realize_gram

# === guessing ===

# `sconh(t, 1)` is cosh(t) and `sconh(t, -1)` is sinh(t); mixing them puts the
# output of `rand_on_shell` on the right shell of the Lorentz form
sconh(t, u) = 0.5*(exp(t) + u*exp(-t))

function rand_on_sphere(rng::AbstractRNG, ::Type{T}, n) where T
  out = randn(rng, T, n)
  tries_left = 2
  while dot(out, out) < 1e-6 && tries_left > 0
    out = randn(rng, T, n)
    tries_left -= 1
  end
  normalize(out)
end

##[TO DO] write a test to confirm that the outputs are on the correct shells
function rand_on_shell(rng::AbstractRNG, shell::T) where T <: Number
  space_part = rand_on_sphere(rng, T, 4)
  rapidity = randn(rng, T)
  sig = sign(shell)
  [sconh(rapidity, sig)*space_part; sconh(rapidity, -sig)]
end

rand_on_shell(rng::AbstractRNG, shells::Array{T}) where T <: Number =
  hcat([rand_on_shell(rng, sh) for sh in shells]...)

rand_on_shell(shells::Array{<:Number}) = rand_on_shell(Random.default_rng(), shells)

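# a sketch of the test suggested above (hypothetical helper, not part of the
# module's API): each column returned by `rand_on_shell` should have Lorentz
# self-product equal to the sign of the requested shell
function shells_look_right(shells::Array{<:Number}; tol = 1e-9)
  vecs = rand_on_shell(shells)
  lorentz_sq(v) = dot(v[1:4], v[1:4]) - v[5]^2
  all(abs(lorentz_sq(v) - sign(sh)) < tol for (v, sh) in zip(eachcol(vecs), shells))
end
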
# === elements ===

plane(normal, offset) = [normal; offset; offset]

function sphere(center, radius)
  dist_sq = dot(center, center)
  return [
    center / radius;
    0.5 * ((dist_sq - 1) / radius - radius);
    0.5 * ((dist_sq + 1) / radius - radius)
  ]
end

# === Gram matrix realization ===

# the Lorentz form
Q = diagm([1, 1, 1, 1, -1])

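# a quick check of the element conventions (hypothetical helper with made-up
# inputs): the unit sphere at the origin comes out as [0, 0, 0, -1, 0], and
# both outputs lie on the shell where the Lorentz self-product is 1
function example_element_check()
  s = sphere([0, 0, 0], 1)   # [0, 0, 0, -1, 0]
  p = plane([1, 0, 0], 2)    # unit normal, offset 2
  dot(s, Q*s), dot(p, Q*p)   # both equal 1
end
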
# project a matrix onto the subspace of matrices whose entries vanish at the
# given indices
function proj_to_entries(mat, indices)
  result = zeros(size(mat))
  for (j, k) in indices
    result[j, k] = mat[j, k]
  end
  result
end

# the difference between the matrices `target` and `attempt`, projected onto the
# subspace of matrices whose entries vanish at each empty index of `target`
function proj_diff(target::SparseMatrixCSC{T, <:Any}, attempt::Matrix{T}) where T
  J, K, values = findnz(target)
  result = zeros(size(target))
  for (j, k, val) in zip(J, K, values)
    result[j, k] = val - attempt[j, k]
  end
  result
end

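# for example (made-up 2×2 data): an explicitly stored zero in `target` still
# counts as a constraint, so its mismatch shows up in the projected difference
function example_proj_diff()
  target = sparse([1, 2, 1], [1, 2, 2], [1.0, 1.0, 0.0])  # entry (1, 2) is a stored zero
  attempt = [1.0 0.5; 0.5 1.0]
  proj_diff(target, attempt)  # [0.0 -0.5; 0.0 0.0]: only the constrained entries appear
end
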
# a type for keeping track of gradient descent history
struct DescentHistory{T}
  scaled_loss::Array{T}
  neg_grad::Array{Matrix{T}}
  base_step::Array{Matrix{T}}
  hess::Array{Hermitian{T, Matrix{T}}}
  slope::Array{T}
  stepsize::Array{T}
  positive::Array{Bool}
  backoff_steps::Array{Int64}
  last_line_L::Array{Matrix{T}}
  last_line_loss::Array{T}

  function DescentHistory{T}(
    scaled_loss = Array{T}(undef, 0),
    neg_grad = Array{Matrix{T}}(undef, 0),
    hess = Array{Hermitian{T, Matrix{T}}}(undef, 0),
    base_step = Array{Matrix{T}}(undef, 0),
    slope = Array{T}(undef, 0),
    stepsize = Array{T}(undef, 0),
    positive = Bool[],
    backoff_steps = Int64[],
    last_line_L = Array{Matrix{T}}(undef, 0),
    last_line_loss = Array{T}(undef, 0)
  ) where T
    new(scaled_loss, neg_grad, base_step, hess, slope, stepsize, positive, backoff_steps, last_line_L, last_line_loss)
  end
end

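# a small convenience sketch (hypothetical helper): print how the scaled loss
# and the amount of backtracking evolved over a run
function describe_history(history::DescentHistory)
  for (loss, backoffs) in zip(history.scaled_loss, history.backoff_steps)
    println("scaled loss ", loss, ", backoff steps ", backoffs)
  end
end
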
# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every
# explicit entry of `gram`. use gradient descent starting from `guess`
function realize_gram_gradient(
  gram::SparseMatrixCSC{T, <:Any},
  guess::Matrix{T};
  scaled_tol = 1e-30,
  min_efficiency = 0.5,
  init_stepsize = 1.0,
  backoff = 0.9,
  max_descent_steps = 600,
  max_backoff_steps = 110
) where T <: Number
  # start history
  history = DescentHistory{T}()

  # scale tolerance
  scale_adjustment = sqrt(T(nnz(gram)))
  tol = scale_adjustment * scaled_tol

  # initialize variables
  stepsize = init_stepsize
  L = copy(guess)

  # do gradient descent
  Δ_proj = proj_diff(gram, L'*Q*L)
  loss = dot(Δ_proj, Δ_proj)
  for _ in 1:max_descent_steps
    # stop if the loss is tolerably low
    if loss < tol
      break
    end

    # find negative gradient of loss function
    neg_grad = 4*Q*L*Δ_proj
    slope = norm(neg_grad)
    dir = neg_grad / slope

    # store current position, loss, and slope
    L_last = L
    loss_last = loss
    push!(history.scaled_loss, loss / scale_adjustment)
    push!(history.neg_grad, neg_grad)
    push!(history.slope, slope)

    # find a good step size using backtracking line search
    push!(history.stepsize, 0)
    push!(history.backoff_steps, max_backoff_steps)
    empty!(history.last_line_L)
    empty!(history.last_line_loss)
    for backoff_steps in 0:max_backoff_steps
      history.stepsize[end] = stepsize
      L = L_last + stepsize * dir
      Δ_proj = proj_diff(gram, L'*Q*L)
      loss = dot(Δ_proj, Δ_proj)
      improvement = loss_last - loss
      push!(history.last_line_L, L)
      push!(history.last_line_loss, loss / scale_adjustment)
      if improvement >= min_efficiency * stepsize * slope
        history.backoff_steps[end] = backoff_steps
        break
      end
      stepsize *= backoff
    end

    # [DEBUG] if we've hit a wall, quit
    if history.backoff_steps[end] == max_backoff_steps
      break
    end
  end

  # return the factorization and its history
  push!(history.scaled_loss, loss / scale_adjustment)
  L, history
end

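# a minimal usage sketch (made-up target, not one of the project's test cases):
# realize two elements with self-products 1 and cross-products -1, which two
# externally tangent unit spheres would satisfy
function example_tangent_pair(rng = Random.default_rng())
  gram = sparse([1, 2, 1, 2], [1, 2, 2, 1], BigFloat[1, 1, -1, -1])
  guess = rand_on_shell(rng, fill(BigFloat(1), 2))
  L, history = realize_gram_gradient(gram, guess)
  L, history.scaled_loss[end]
end
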
# the standard basis matrix with a 1 in entry (j, k)
function basis_matrix(::Type{T}, j, k, dims) where T
  result = zeros(T, dims)
  result[j, k] = one(T)
  result
end

# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every
# explicit entry of `gram`. use Newton's method starting from `guess`
function realize_gram_newton(
  gram::SparseMatrixCSC{T, <:Any},
  guess::Matrix{T};
  scaled_tol = 1e-30,
  rate = 1,
  max_steps = 100
) where T <: Number
  # start history
  history = DescentHistory{T}()

  # find the dimension of the search space
  dims = size(guess)
  element_dim, construction_dim = dims
  total_dim = element_dim * construction_dim

  # list the constrained entries of the gram matrix
  J, K, _ = findnz(gram)
  constrained = zip(J, K)

  # scale the tolerance
  scale_adjustment = sqrt(T(length(constrained)))
  tol = scale_adjustment * scaled_tol

  # use Newton's method
  L = copy(guess)
  for step in 0:max_steps
    # evaluate the loss function
    Δ_proj = proj_diff(gram, L'*Q*L)
    loss = dot(Δ_proj, Δ_proj)

    # store the current loss
    push!(history.scaled_loss, loss / scale_adjustment)

    # stop if the loss is tolerably low or the step limit is reached
    if loss < tol || step == max_steps
      break
    end

    # find the negative gradient of loss function
    neg_grad = 4*Q*L*Δ_proj

    # find the Hessian of the loss function
    hess = Matrix{T}(undef, total_dim, total_dim)
    indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim]
    for (j, k) in indices
      basis_mat = basis_matrix(T, j, k, dims)
      neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat
      neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained)
      deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj)
      hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
    end
    hess = Hermitian(hess)
    push!(history.hess, hess)

    # compute the Newton step
    newton_step = hess \ reshape(neg_grad, total_dim)
    L += rate * reshape(newton_step, dims)
  end

  # return the factorization and its history
  L, history
end

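# one way these solvers can be combined (illustrative, not a prescribed
# workflow): take rough gradient-descent output and sharpen it with a few
# Newton steps
function example_polish(gram, guess)
  L_rough, _ = realize_gram_gradient(gram, guess)
  realize_gram_newton(gram, L_rough, max_steps = 4)
end
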
# handle `eigen!` for symmetric BigFloat matrices by passing them along as Hermitian
LinearAlgebra.eigen!(A::Symmetric{BigFloat, Matrix{BigFloat}}; sortby::Nothing) =
  eigen!(Hermitian(A))

# convert the stored entries of a sparse matrix to the given type without
# dropping explicitly stored zeros
function convertnz(type, mat)
  J, K, values = findnz(mat)
  sparse(J, K, type.(values))
end

# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every
# explicit entry of `gram`. use Optim's Newton trust region method starting
# from `guess`
function realize_gram_optim(
  gram::SparseMatrixCSC{T, <:Any},
  guess::Matrix{T}
) where T <: Number
  # find the dimension of the search space
  dims = size(guess)
  element_dim, construction_dim = dims
  total_dim = element_dim * construction_dim

  # list the constrained entries of the gram matrix
  J, K, _ = findnz(gram)
  constrained = zip(J, K)

  # scale the loss function
  scale_adjustment = length(constrained)

  function loss(L_vec)
    L = reshape(L_vec, dims)
    Δ_proj = proj_diff(gram, L'*Q*L)
    dot(Δ_proj, Δ_proj) / scale_adjustment
  end

  function loss_grad!(storage, L_vec)
    L = reshape(L_vec, dims)
    Δ_proj = proj_diff(gram, L'*Q*L)
    storage .= reshape(-4*Q*L*Δ_proj, total_dim) / scale_adjustment
  end

  function loss_hess!(storage, L_vec)
    L = reshape(L_vec, dims)
    Δ_proj = proj_diff(gram, L'*Q*L)
    indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim]
    for (j, k) in indices
      basis_mat = basis_matrix(T, j, k, dims)
      neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat
      neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained)
      deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj) / scale_adjustment
      storage[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
    end
  end

  optimize(
    loss, loss_grad!, loss_hess!,
    reshape(guess, total_dim),
    NewtonTrustRegion()
  )
end

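# the Optim-based solver returns an Optim result object rather than a matrix;
# unpacking it might look like this (hypothetical helper)
function example_unpack_optim(gram, guess)
  result = realize_gram_optim(gram, guess)
  L = reshape(Optim.minimizer(result), size(guess))
  L, Optim.minimum(result), Optim.converged(result)
end
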
# seek a matrix `L` for which `L'QL` matches the sparse matrix `gram` at every
# explicit entry of `gram`. use a regularized Newton's method with backtracking,
# starting from `guess`
function realize_gram(
  gram::SparseMatrixCSC{T, <:Any},
  guess::Matrix{T};
  scaled_tol = 1e-16,
  min_efficiency = 0.5,
  init_rate = 1.0,
  backoff = 0.9,
  reg_scale = 1.1,
  max_descent_steps = 200,
  max_backoff_steps = 110
) where T <: Number
  # start history
  history = DescentHistory{T}()

  # find the dimension of the search space
  dims = size(guess)
  element_dim, construction_dim = dims
  total_dim = element_dim * construction_dim

  # list the constrained entries of the gram matrix
  J, K, _ = findnz(gram)
  constrained = zip(J, K)

  # scale the tolerance
  scale_adjustment = sqrt(T(length(constrained)))
  tol = scale_adjustment * scaled_tol

  # initialize variables
  grad_rate = init_rate
  L = copy(guess)

  # use Newton's method with backtracking and gradient descent backup
  Δ_proj = proj_diff(gram, L'*Q*L)
  loss = dot(Δ_proj, Δ_proj)
  for step in 1:max_descent_steps
    # stop if the loss is tolerably low
    if loss < tol
      break
    end

    # find the negative gradient of loss function
    neg_grad = 4*Q*L*Δ_proj

    # find the Hessian of the loss function
    hess = Matrix{T}(undef, total_dim, total_dim)
    indices = [(j, k) for k in 1:construction_dim for j in 1:element_dim]
    for (j, k) in indices
      basis_mat = basis_matrix(T, j, k, dims)
      neg_dΔ = basis_mat'*Q*L + L'*Q*basis_mat
      neg_dΔ_proj = proj_to_entries(neg_dΔ, constrained)
      deriv_grad = 4*Q*(-basis_mat*Δ_proj + L*neg_dΔ_proj)
      hess[:, (k-1)*element_dim + j] = reshape(deriv_grad, total_dim)
    end
    hess = Hermitian(hess)
    push!(history.hess, hess)

    # regularize the Hessian
    min_eigval = minimum(eigvals(hess))
    push!(history.positive, min_eigval > 0)
    if min_eigval <= 0
      hess -= reg_scale * min_eigval * I
    end
    base_step = reshape(hess \ reshape(neg_grad, total_dim), dims)
    push!(history.base_step, base_step)

    # store the current position, loss, and slope
    L_last = L
    loss_last = loss
    push!(history.scaled_loss, loss / scale_adjustment)
    push!(history.neg_grad, neg_grad)
    push!(history.slope, norm(neg_grad))

    # find a good step size using backtracking line search
    push!(history.stepsize, 0)
    push!(history.backoff_steps, max_backoff_steps)
    empty!(history.last_line_L)
    empty!(history.last_line_loss)
    rate = one(T)
    step_success = false
    for backoff_steps in 0:max_backoff_steps
      history.stepsize[end] = rate
      L = L_last + rate * base_step
      Δ_proj = proj_diff(gram, L'*Q*L)
      loss = dot(Δ_proj, Δ_proj)
      improvement = loss_last - loss
      push!(history.last_line_L, L)
      push!(history.last_line_loss, loss / scale_adjustment)
      if improvement >= min_efficiency * rate * dot(neg_grad, base_step)
        history.backoff_steps[end] = backoff_steps
        step_success = true
        break
      end
      rate *= backoff
    end

    # if we've hit a wall, quit
    if !step_success
      return L_last, false, history
    end
  end

  # return the factorization and its history
  push!(history.scaled_loss, loss / scale_adjustment)
  L, true, history
end

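# a sketch of the precision comparison described in the commit message
# (hypothetical helper): build the Gram matrix in BigFloat, then use `convertnz`
# so the Float64 run keeps exactly the same constraints
function example_compare_precisions(gram_big::SparseMatrixCSC{BigFloat, <:Any}, guess_big::Matrix{BigFloat})
  L_big, success_big, history_big = realize_gram(gram_big, guess_big)
  L_flo, success_flo, history_flo = realize_gram(convertnz(Float64, gram_big), Float64.(guess_big))
  (success_big, length(history_big.scaled_loss)), (success_flo, length(history_flo.scaled_loss))
end
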
end