
    (^iA                    D   d Z ddlmZ ddlZddlmZ ddlmZ ddlZddlZ	ddl
mZ ddlmZ erddlmZ ddlZddlZndd	lmZ  ed
      Z ed      Z ee      ZddZ G d dej.                  j0                        Z G d d      Z	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZy)a  Notations in this Gaussian process implementation

X_train: Observed parameter values with the shape of (len(trials), len(params)).
y_train: Observed objective values with the shape of (len(trials), ).
x: (Possibly batched) parameter value(s) to evaluate with the shape of (..., len(params)).
cov_fX_fX: Kernel matrix X = V[f(X)] with the shape of (len(trials), len(trials)).
cov_fx_fX: Kernel matrix Cov[f(x), f(X)] with the shape of (..., len(trials)).
cov_fx_fx: Kernel scalar value x = V[f(x)]. This value is constant for the Matern 5/2 kernel.
cov_Y_Y_inv:
    The inverse of the covariance matrix (V[f(X) + noise_var])^-1 with the shape of
    (len(trials), len(trials)).
cov_Y_Y_inv_Y: `cov_Y_Y_inv @ y` with the shape of (len(trials), ).
max_Y: The maximum of Y (Note that we transform the objective values such that it is maximized.)
sqd: The squared differences of each dimension between two points.
is_categorical:
    A boolean array with the shape of (len(params), ). If is_categorical[i] is True, the i-th
    parameter is categorical.
    )annotationsN)Any)TYPE_CHECKING)*single_blas_thread_if_scipy_v1_15_or_newer)
get_logger)Callable)_LazyImportscipytorchc                   t        j                  |       }t        j                  |      r| S t        j                  d       t        j
                  |d      }t        j                  | t        j                  |t        j                  t        j                  || t         j                        d      d      t        j                  |t        j                  t        j                  || t         j                         d      d            S )NzDClip non-finite values to the min/max finite values for GP fittings.r   )axis        )npisfiniteallwarningswarnanyclipwheremininfmax)valuesis_values_finiteis_any_finites      N/var/www/html/hubwallet-dev/venv/lib/python3.12/site-packages/optuna/_gp/gp.pywarn_and_convert_infr   /   s    {{6*	vvMMXYFF+!4M 77
rxx0@&"&&'QXY Z\_`
rxx0@&266''RYZ []`a     c                  0    e Zd Zedd       Zedd       Zy)Matern52Kernelc                    t        j                  d|z        }t        j                  |       }|d|z  |z   dz   z  }d|dz   z  |z  }| j                  |       |S )a  
        This method calculates `exp(-sqrt5d) * (1/3 * sqrt5d ** 2 + sqrt5d + 1)` where
        `sqrt5d = sqrt(5 * squared_distance)`.

        Please note that automatic differentiation by PyTorch does not work well at
        `squared_distance = 0` due to zero division, so we manually save the derivative, i.e.,
        `-5/6 * (1 + sqrt5d) * exp(-sqrt5d)`, for the exact derivative calculation.

        Notice that the derivative of this function is taken w.r.t. d**2, but not w.r.t. d.
           g?   g)r   sqrtexpsave_for_backward)ctxsquared_distancesqrt5dexp_partvalderivs         r   forwardzMatern52Kernel.forward@   sh     A 00199fW%5$44v=ABFQJ'(2e$
r   c                (    | j                   \  }||z  S )z
        Let x be squared_distance, f(x) be forward(ctx, x), and g(f) be a provided function, then
        deriv := df/dx, grad := dg/df, and deriv * grad = df/dx * dg/df = dg/dx.
        )saved_tensors)r(   gradr-   s      r   backwardzMatern52Kernel.backwardS   s     $$t|r   N)r(   r   r)   torch.Tensorreturnr3   )r(   r   r1   r3   r4   r3   )__name__
__module____qualname__staticmethodr.   r2    r   r   r!   r!   ?   s(     $  r   r!   c                      e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	dZed
d       ZddZ	 d	 	 	 	 	 ddZdddZddZ		 	 	 	 	 	 	 	 	 	 ddZ
y)GPRegressorc                   || _         || _        || _        |j                  d      |j                  d      z
  j	                         | _        | j                   j                         rT| j
                  d| j                   f   dkD  j                  t        j                        | j
                  d| j                   f<   d | _
        d | _        || _        || _        || _        y )N.r   )_is_categorical_X_train_y_train	unsqueezesquare__squared_X_diffr   typer   float64_cov_Y_Y_chol_cov_Y_Y_inv_Yinverse_squared_lengthscaleskernel_scale	noise_var)selfis_categoricalX_trainy_trainrI   rJ   rK   s          r   __init__zGPRegressor.__init__^   s      . ' 1 1" 58I8I"8M MVVX##%$$S$*>*>%>?#Ed5==!   d&:&:!:; 3737,H)("r   c                ~    dt        j                  | j                  j                         j	                               z  S )Ng      ?)r   r%   rI   detachnumpy)rL   s    r   length_scaleszGPRegressor.length_scalesv   s.    RWWT>>EEGMMOPPPr   c                   | j                   | j                  J d       t        j                         5  | j	                         j                         j                         }d d d        t        j                  | j                  j                  d         xx   | j                  j                         z  cc<   t        j                  j                  |      }t        j                  j!                  |j"                  t        j                  j!                  || j$                  j                         d      d      }t        j&                  |      | _         t        j&                  |      | _        | j(                  j                         | _        d | j(                  _        | j,                  j                         | _        d | j,                  _        | j                  j                         | _        d | j                  _        y # 1 sw Y   xY w)Nz(Cannot call cache_matrix more than once.r   T)lowerF)rG   rH   r   no_gradkernelrR   rS   r   diag_indicesr@   shaperK   itemlinalgcholeskyr
   solve_triangularTrA   
from_numpyrI   r1   rJ   )rL   cov_Y_Ycov_Y_Y_cholcov_Y_Y_inv_Ys       r   _cache_matrixzGPRegressor._cache_matrixz   s   &4+>+>+F	65	6F]]_ 	5kkm**,224G	5 	 3 3A 678DNN<O<O<QQ8yy))'2 55NNLL)),8K8K8MUY)Z 6 
 #--l;#..}=,0,M,M,T,T,V)15)). --446!%..0"+	5 	5s   -H  H
Nc                   ||J | j                   }n|| j                  }|j                  dk(  r||z
  n"|j                  d      |j                  d      z
  j	                         }| j
                  j                         r@|d| j
                  f   dkD  j                  t        j                        |d| j
                  f<   |j                  | j                        }t        j                  |      | j                  z  S )am  
        Return the kernel matrix with the shape of (..., n_A, n_B) given X1 and X2 each with the
        shapes of (..., n_A, len(params)) and (..., n_B, len(params)).

        If x1 and x2 have the shape of (len(params), ), kernel(x1, x2) is computed as:
            kernel_scale * Matern52Kernel.apply(
                sqd(x1, x2) @ inverse_squared_lengthscales
            )
        where if x1[i] is continuous, sqd(x1, x2)[i] = (x1[i] - x2[i]) ** 2 and if x1[i] is
        categorical, sqd(x1, x2)[i] = int(x1[i] != x2[i]).
        Note that the distance for categorical parameters is the Hamming distance.
        r$   r=   r>   .r   )rD   r@   ndimrB   rC   r?   r   rE   r   rF   matmulrI   r!   applyrJ   )rL   X1X2sqdsqdists        r   rX   zGPRegressor.kernel   s     :::&&Cz]] ggl27R0@2<<PRCS0S\\^C##'')25c4;O;O6O2PSV2V1\1\MM2C---. D==>##F+d.?.???r   c           	     Z   | j                   | j                  J d       |j                  dk(  }|s|n|j                  d      }t        j
                  j                  | j                  |      x}| j                        }t        j
                  j                  | j                   t        j
                  j                  | j                   j                  |dd      dd      }|rb|rJ d       | j                  ||      }||j                  |j                  dd	            z
  }	|	j                  d	d
      j                  d       n@| j                  }|t        j
                  j                  ||      z
  }	|	j                  d       |r"|j                  d      |	j                  d      fS ||	fS )a)  
        This method computes the posterior mean and variance given the points `x` where both mean
        and variance tensors will have the shape of x.shape[:-1].
        If ``joint=True``, the joint posterior will be computed.

        The posterior mean and variance are computed as:
            mean = cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ y, and
            var = cov_fx_fx - cov_fx_fX @ inv(cov_fX_fX + noise_var * I) @ cov_fx_fX.T.

        Please note that we clamp the variance to avoid negative values due to numerical errors.
        z+Call cache_matrix before calling posterior.r$   r   TF)upperleftz3Call posterior with joint=False for a single point.r=   )dim1dim2r   )rG   rH   rf   rB   r   r\   vecdotrX   r^   r_   rg   	transposediagonal
clamp_min_rJ   squeeze)
rL   xjointis_single_pointx_	cov_fx_fXmeanV	cov_fx_fxvar_s
             r   	posteriorzGPRegressor.posterior   s{    *t/B/B/N	98	9N&&A+%Q1;;q>||""B#?9ATATULL))LL))$*<*<*>*>	QU\a)b	 * 
 &](]]&B+Iqxx	(;(;B(CDDDMMrM+66s;))Iu||229a@@DOOC 5DQa1V4QU,Vr   c                >   | j                   j                  d   }d|z  t        j                  dt        j                  z        z  }| j                         | j                  t        j                  |t        j                        z  z   }t        j                  j                  |      }|j                         j                         j                          }t        j                  j                  || j                  dddf   d      dddf   }d||z  z  }||z   |z   S )a  
        This method computes the marginal log-likelihood of the kernel hyperparameters given the
        training dataset (X, y).
        Assume that N = len(X) in this method.

        Mathematically, the closed form is given as:
            -0.5 * log((2*pi)**N * det(C)) - 0.5 * y.T @ inv(C) @ y
            = -0.5 * log(det(C)) - 0.5 * y.T @ inv(C) @ y + const,
        where C = cov_Y_Y = cov_fX_fX + noise_var * I and inv(...) is the inverse operator.

        We exploit the full advantages of the Cholesky decomposition (C = L @ L.T) in this method:
            1. The determinant of a lower triangular matrix is the diagonal product, which can be
               computed with N flops where log(det(C)) = log(det(L.T @ L)) = 2 * log(det(L)).
            2. Solving linear system L @ u = y, which yields u = inv(L) @ y, costs N**2 flops.
        Note that given `u = inv(L) @ y` and `inv(C) = inv(L @ L.T) = inv(L).T @ inv(L)`,
        y.T @ inv(C) @ y is calculated as (inv(L) @ y) @ (inv(L) @ y).

        In principle, we could invert the matrix C first, but in this case, it costs:
            1. 1/3*N**3 flops for the determinant of inv(C).
            2. 2*N**2-N flops to solve C @ alpha = y, which is alpha = inv(C) @ y.

        Since the Cholesky decomposition costs 1/3*N**3 flops and the matrix inversion costs
        2/3*N**3 flops, the overall cost for the former is 1/3*N**3+N**2+N flops and that for the
        latter is N**3+2*N**2-N flops.
        r   g         dtypeNF)rn   )r@   rZ   mathlogpirX   rK   r   eyerF   r\   r]   ru   sumr^   rA   )rL   n_pointsconstra   Llogdet_partinv_L_y	quad_parts           r   marginal_log_likelihoodz#GPRegressor.marginal_log_likelihood   s    4 ==&&q)x$((1tww;"77++-$..599XU]]3["[[LL!!'*zz|'')--//,,//4==D3IQV/WXY[\X\]Gg-.	U"Y..r   c           	        	  j                   j                  d   	t        j                  t        j                   j
                  j                         j                               t        j                   j                  j                               t        j                   j                  j                         dz  z
        gg      }d
	 fd}t               5  t        j                  j                  ||ddd|i      }d d d        j                  st!        d|j"                         t%        j&                  |j(                        }t%        j*                  |d 	        _        t%        j*                  |	          _        r%t%        j,                  t$        j.                  	      nt%        j*                  |	dz            z    _
         j1                           S # 1 sw Y   xY w)Nr$   gGz?c                   t        j                  |       j                  d      }t        j                         5  t        j                  |d        _        t        j                  |         _        r%t        j                  t         j                        nt        j                  |dz            z   _	        j                                 z
  }|j                          |j                  dz      }r|dk(  sJ d d d        j                         |j                  j                         j                         fS # 1 sw Y   AxY w)NTr   r$   r   )r   r`   requires_grad_enable_gradr&   rI   rJ   tensorrF   rK   r   r2   r1   r[   rR   rS   )	
raw_paramsraw_params_tensorlossraw_noise_var_graddeterministic_objective	log_priorminimum_noisen_paramsrL   s	       r   	loss_funcz1GPRegressor._fit_kernel_params.<locals>.loss_func  s(    % 0 0 < K KD Q""$ N49II>OPYQY>Z4[1$)II.?.I$J! / LLemmD#4X\#BCmS 
 44664H%6%;%;HqL%I"26HA6MMMN 99; 1 6 6 = = ? E E GGGN Ns   C
EETzl-bfgs-bgtol)jacmethodoptionszOptimization failed: r   )r   
np.ndarrayr4   ztuple[float, np.ndarray])r@   rZ   r   concatenater   rI   rR   rS   rJ   r[   rK   r   r
   optimizeminimizesuccessRuntimeErrormessager   r`   rx   r&   r   rF   rd   )
rL   r   r   r   r   initial_raw_paramsr   resraw_params_opt_tensorr   s
   ````     @r   _fit_kernel_paramszGPRegressor._fit_kernel_params   s    ==&&q)  ^^t88??AGGIJFF4,,1134FF4>>..04-3GGH	
	H 	H" 89 		..))"! * C		 {{!6s{{mDEE % 0 0 7,1II6KIX6V,W)!II&;H&EF ' LLemm<+@A+N!OO 	
 	-		 		s   !'G))G2)rM   r3   rN   r3   rO   r3   rI   r3   rJ   r3   rK   r3   r4   None)r4   r   )r4   r   )NN)ri   torch.Tensor | Nonerj   r   r4   r3   )F)rx   r3   ry   boolr4   z!tuple[torch.Tensor, torch.Tensor])r4   r3   )
r   %Callable[[GPRegressor], torch.Tensor]r   floatr   r   r   r   r4   r;   )r5   r6   r7   rP   propertyrT   rd   rX   r   r   r   r9   r   r   r;   r;   ]   s    #$# # 	#
 '3# ##  # 
#0 Q Q#8 IM@%@2E@	@<#WJ!/F@8@ @ "&	@
 @ 
@r   r;   c           	     J    t        j                   j                  d   dz   t         j                        d	 fd} |       }	| |       }d }
||	fD ]  }	 t	        t        j
                        t        j
                         t        j
                        |j                  |j                  |j                        j                  ||||      c S  t        j                  d|
 d        |       }|j                          |S # t        $ r}|}
Y d }~d }~ww xY w)
Nr$   r   r   c            	         t        t        j                        t        j                         t        j                        d d j                         d   j                         d   j                               S )Nr=   rp   rM   rN   rO   rI   rJ   rK   )r;   r   r`   clone)XYdefault_kernel_paramsrM   s   r   _default_gprz'fit_kernel_params.<locals>._default_gprK  so     ++N;$$Q'$$Q')>s)C)I)I)K.r288:+B/557
 	
r   r   )r   r   r   r   z/The optimization of kernel parameters failed: 
z<
The default initial kernel parameters will be used instead.)r4   r;   )r   onesrZ   rF   r;   r`   rI   rJ   rK   r   r   loggerwarningrd   )r   r   rM   r   r   r   	gpr_cacher   r   default_gpr_cacheerrorgpr_cache_to_useedefault_gprr   s   ```           @r   fit_kernel_paramsr   >  s-    "JJqwwqzA~U]]K
 
 % N	E '(9: 	$//?((+((+-=-Z-Z-::*44 ! #+(?	 ! $ NN
:5' BF 	F .K  	E	s   A:D	D"DD")r   r   r4   r   )Ng{Gz?)r   r   r   r   rM   r   r   r   r   r   r   r   r   zGPRegressor | Noner   r   r4   r;   )__doc__
__future__r   r   typingr   r   r   rS   r   "optuna._gp.scipy_blas_thread_patchr   optuna.loggingr   collections.abcr   r
   r   optuna._importsr	   r5   r   r   autogradFunctionr!   r;   r   r9   r   r   <module>r      s   & #       Y % (+ E E	H	 U^^,, <^ ^P %)777 7 5	7
 7 "7 "7 7 7r   