File Coverage

levmar-2.5/lm_core.c

Criterion	Covered	Total	%
statement	268	316	84.8
branch	359	440	81.5
condition			n/a
subroutine			n/a
pod			n/a
total	627	756	82.9

line	stmt	bran	code
1			/////////////////////////////////////////////////////////////////////////////////
2			//
3			// Levenberg - Marquardt non-linear minimization algorithm
4			// Copyright (C) 2004 Manolis Lourakis (lourakis at ics forth gr)
5			// Institute of Computer Science, Foundation for Research & Technology - Hellas
6			// Heraklion, Crete, Greece.
7			//
8			// This program is free software; you can redistribute it and/or modify
9			// it under the terms of the GNU General Public License as published by
10			// the Free Software Foundation; either version 2 of the License, or
11			// (at your option) any later version.
12			//
13			// This program is distributed in the hope that it will be useful,
14			// but WITHOUT ANY WARRANTY; without even the implied warranty of
15			// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16			// GNU General Public License for more details.
17			//
18			/////////////////////////////////////////////////////////////////////////////////
19
20			#ifndef LM_REAL // not included by lm.c
21			#error This file should not be compiled directly!
22			#endif
23
24
25			/* precision-specific definitions */
26			#define LEVMAR_DER LM_ADD_PREFIX(levmar_der)
27			#define LEVMAR_DIF LM_ADD_PREFIX(levmar_dif)
28			#define LEVMAR_FDIF_FORW_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_forw_jac_approx)
29			#define LEVMAR_FDIF_CENT_JAC_APPROX LM_ADD_PREFIX(levmar_fdif_cent_jac_approx)
30			#define LEVMAR_TRANS_MAT_MAT_MULT LM_ADD_PREFIX(levmar_trans_mat_mat_mult)
31			#define LEVMAR_L2NRMXMY LM_ADD_PREFIX(levmar_L2nrmxmy)
32			#define LEVMAR_COVAR LM_ADD_PREFIX(levmar_covar)
33
34			#ifdef HAVE_LAPACK
35			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU)
36			#define AX_EQ_B_CHOL LM_ADD_PREFIX(Ax_eq_b_Chol)
37			#define AX_EQ_B_QR LM_ADD_PREFIX(Ax_eq_b_QR)
38			#define AX_EQ_B_QRLS LM_ADD_PREFIX(Ax_eq_b_QRLS)
39			#define AX_EQ_B_SVD LM_ADD_PREFIX(Ax_eq_b_SVD)
40			#define AX_EQ_B_BK LM_ADD_PREFIX(Ax_eq_b_BK)
41			#else
42			#define AX_EQ_B_LU LM_ADD_PREFIX(Ax_eq_b_LU_noLapack)
43			#endif /* HAVE_LAPACK */
44
45			/*
46			* This function seeks the parameter vector p that best describes the measurements vector x.
47			* More precisely, given a vector function func : R^m --> R^n with n>=m,
48			* it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of
49			* e=x-func(p) is minimized.
50			*
51			* This function requires an analytic Jacobian. In case the latter is unavailable,
52			* use LEVMAR_DIF() below
53			*
54			* Returns the number of iterations (>=0) if successful, LM_ERROR if failed
55			*
56			* For more details, see K. Madsen, H.B. Nielsen and O. Tingleff's lecture notes on
57			* non-linear least squares at http://www.imm.dtu.dk/pubdb/views/edoc_download.php/3215/pdf/imm3215.pdf
58			*/
59
60	125		int LEVMAR_DER(
61			void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
62			void (*jacf)(LM_REAL *p, LM_REAL *j, int m, int n, void *adata), /* function to evaluate the Jacobian \part x / \part p */
63			LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */
64			LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */
65			int m, /* I: parameter vector dimension (i.e. #unknowns) */
66			int n, /* I: measurement vector dimension */
67			int itmax, /* I: maximum number of iterations */
68			LM_REAL opts[4], /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
69			* stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2. Set to NULL for defaults to be used
70			*/
71			LM_REAL info[LM_INFO_SZ],
72			/* O: information regarding the minimization. Set to NULL if don't care
73			* info[0]= \|\|e\|\|_2 at initial p.
74			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
75			* info[5]= # iterations,
76			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
77			* 2 - stopped by small Dp
78			* 3 - stopped by itmax
79			* 4 - singular matrix. Restart from current p with increased mu
80			* 5 - no further error reduction is possible. Restart with increased mu
81			* 6 - stopped by small \|\|e\|\|_2
82			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
83			* info[7]= # function evaluations
84			* info[8]= # Jacobian evaluations
85			* info[9]= # linear systems solved, i.e. # attempts for reducing error
86			*/
87			LM_REAL *work, /* working memory at least LM_DER_WORKSZ() reals large, allocated if NULL */
88			LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
89			void *adata) /* pointer to possibly additional data, passed uninterpreted to func & jacf.
90			* Set to NULL if not needed
91			*/
92			{
93			register int i, j, k, l;
94
95
96	387	100	for(j=0; j < m; ++j) {
		100
97			// fprintf(stderr,"lm_core: p[%d]=%e\n",j,p[j]);
98			}
99
100	195055	100	for(j=0; j < n; ++j) {
		100
101			// fprintf(stderr,"lm_core: x[%d]=%e\n",j,x[j]);
102			}
103
104
105
106	125		int worksz, freework=0, issolved;
107			/* temp work arrays */
108			LM_REAL *e, /* nx1 */
109			*hx, /* \hat{x}_i, nx1 */
110			*jacTe, /* J^T e_i mx1 */
111			*jac, /* nxm */
112			*jacTjac, /* mxm */
113			*Dp, /* mx1 */
114			*diag_jacTjac, /* diagonal of J^T J, mx1 */
115			*pDp; /* p + Dp, mx1 */
116
117			register LM_REAL mu, /* damping constant */
118			tmp; /* mainly used in matrix & vector multiplications */
119			LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* \|\|e(p)\|\|_2, \|\|J^T e\|\|_inf, \|\|e(p+Dp)\|\|_2 */
120	125		LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
121			LM_REAL tau, eps1, eps2, eps2_sq, eps3;
122			LM_REAL init_p_eL2;
123	125		int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;
124	125		const int nm=n*m;
125	125		int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
126
127	125		mu=jacTe_inf=0.0; /* -Wall */
128
129	125	50	if(n<m){
		50
130	0		fprintf(stderr, LCAT(LEVMAR_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m);
131	0		return LM_ERROR;
132			}
133
134	125	50	if(!jacf){
		50
135	0		fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_DER)
136			RCAT("().\nIf no such function is available, use ", LEVMAR_DIF) RCAT("() rather than ", LEVMAR_DER) "()\n");
137	0		return LM_ERROR;
138			}
139
140	125	50	if(opts){
		50
141	125		tau=opts[0];
142	125		eps1=opts[1];
143	125		eps2=opts[2];
144	125		eps2_sq=opts[2]*opts[2];
145	125		eps3=opts[3];
146			}
147			else{ // use default values
148	0		tau=LM_CNST(LM_INIT_MU);
149	0		eps1=LM_CNST(LM_STOP_THRESH);
150	0		eps2=LM_CNST(LM_STOP_THRESH);
151	0		eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH);
152	0		eps3=LM_CNST(LM_STOP_THRESH);
153			}
154
155	125	50	if(!work){
		50
156	0		worksz=LM_DER_WORKSZ(m, n); //2n+4m + nm + mm;
157	0		work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
158	0	0	if(!work){
		0
159	0		fprintf(stderr, LCAT(LEVMAR_DER, "(): memory allocation request failed\n"));
160	0		return LM_ERROR;
161			}
162	0		freework=1;
163			}
164
165			/* set up work arrays */
166	125		e=work;
167	125		hx=e + n;
168	125		jacTe=hx + n;
169	125		jac=jacTe + m;
170	125		jacTjac=jac + nm;
171	125		Dp=jacTjac + m*m;
172	125		diag_jacTjac=Dp + m;
173	125		pDp=diag_jacTjac + m;
174
175			/* compute e=x - f(p) and its L2 norm */
176	125		(*func)(p, hx, m, n, adata); nfev=1;
177			/* ### e=x-hx, p_eL2=\|\|e\|\| */
178			#if 1
179	125		p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);
180			#else
181			for(i=0, p_eL2=0.0; i<n; ++i){
182			e[i]=tmp=x[i]-hx[i];
183			p_eL2+=tmp*tmp;
184			}
185			#endif
186	125		init_p_eL2=p_eL2;
187	125	50	if(!LM_FINITE(p_eL2)) stop=7;
		50
188
189	31012	100	for(k=0; k<itmax && !stop; ++k){
		100
		100
		100
190			/* Note that p and e have been updated at a previous iteration */
191
192	30987	100	if(p_eL2<=eps3){ /* error is small */
		100
193	100		stop=6;
194	100		break;
195			}
196
197			/* Compute the Jacobian J at p, J^T J, J^T e, \|\|J^T e\|\|_inf and \|\|p\|\|^2.
198			* Since J^T J is symmetric, its computation can be sped up by computing
199			* only its upper triangular part and copying it to the lower part
200			*/
201
202	30887		(*jacf)(p, jac, m, n, adata); ++njev;
203
204			/* J^T J, J^T e */
205	30887		if(nm<__BLOCKSZ__SQ){ // this is a small problem
206			/* J^TJ_ij = \sum_l J^T_il J_lj = \sum_l J_li * J_lj.
207			* Thus, the product J^T J can be computed using an outer loop for
208			* l that adds J_li*J_lj to each element ij of the result. Note that
209			* with this scheme, the accesses to J and JtJ are always along rows,
210			* and therefore incur fewer cache misses compared to the straightforward
211			* algorithm for computing the product (i.e., l loop is innermost one).
212			* A similar scheme applies to the computation of J^T e.
213			* However, for large minimization problems (i.e., involving a large number
214			* of unknowns and measurements) for which J/J^T J rows are too large to
215			* fit in the L1 cache, even this scheme incurs many cache misses. In
216			* such cases, a cache-efficient blocking scheme is preferable.
217			*
218			* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
219			* performance problem.
220			*
221			* Note that the non-blocking algorithm is faster on small
222			* problems since in this case it avoids the overheads of blocking.
223			*/
224
225			/* looping downwards saves a few computations */
226			register int l, im;
227			register LM_REAL alpha, *jaclm;
228
229	151340	100	for(i=m*m; i-->0; )
		100
230	121072		jacTjac[i]=0.0;
231	90804	100	for(i=m; i-->0; )
		100
232	60536		jacTe[i]=0.0;
233
234	109502	100	for(l=n; l-->0; ){
		100
235	79234		jaclm=jac+l*m;
236	237702	100	for(i=m; i-->0; ){
		100
237	158468		im=i*m;
238	158468		alpha=jaclm[i]; //jac[l*m+i];
239	396170	100	for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
		100
240	237702		jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]
241
242			/* J^T e */
243	158468		jacTe[i]+=alpha*e[l];
244			}
245			}
246
247	90804	100	for(i=m; i-->0; ) /* copy to upper part */
		100
248	90804	100	for(j=i+1; j<m; ++j)
		100
249	30268		jacTjac[i*m+j]=jacTjac[j*m+i];
250
251			}
252			else{ // this is a large problem
253			/* Cache efficient computation of J^T J based on blocking
254			*/
255	619		LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m);
256
257			/* cache efficient computation of J^T e */
258	1917	100	for(i=0; i<m; ++i)
		0
259	1298		jacTe[i]=0.0;
260
261	1573619	100	for(i=0; i<n; ++i){
		0
262			register LM_REAL *jacrow;
263
264	4779000	100	for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l)
		0
265	3206000		jacTe[l]+=jacrow[l]*tmp;
266			}
267			}
268
269			/* Compute \|\|J^T e\|\|_inf and \|\|p\|\|^2 */
270	92721	100	for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){
		100
271	61834	100	if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
		100
		100
		100
272
273	61834		diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
274	61834		p_L2+=p[i]*p[i];
275			}
276			//p_L2=sqrt(p_L2);
277
278			#if 0
279			if(!(k%100)){
280			printf("Current estimate: ");
281			for(i=0; i<m; ++i)
282			printf("%.9g ", p[i]);
283			printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
284			}
285			#endif
286
287			/* check for convergence */
288	30887	50	if((jacTe_inf <= eps1)){
		50
289	0		Dp_L2=0.0; /* no increment for p in this case */
290	0		stop=1;
291	0		break;
292			}
293
294			/* compute initial damping factor */
295	30887	100	if(k==0){
		100
296	387	100	for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
		100
297	262	100	if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
		100
298	125		mu=tau*tmp;
299			}
300
301			/* determine increment using adaptive damping */
302			while(1){
303			/* augment normal equations */
304	121284	100	for(i=0; i<m; ++i)
		100
305	80876		jacTjac[i*m+i]+=mu;
306
307			/* solve augmented equations */
308			#ifdef HAVE_LAPACK
309			/* 6 alternatives are available: LU, Cholesky, 2 variants of QR decomposition, SVD and LDLt.
310			* Cholesky is the fastest but might be inaccurate; QR is slower but more accurate;
311			* SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
312			*/
313
314			issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK;
315			//issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
316			//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
317			//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
318			//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
319			//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
320
321			#else
322			/* use the LU included with levmar */
323	40408		issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
324			#endif /* HAVE_LAPACK */
325
326	40408		if(issolved){
327			/* compute p's new estimate and \|\|Dp\|\|^2 */
328	121284	100	for(i=0, Dp_L2=0.0; i<m; ++i){
		100
329	80876		pDp[i]=p[i] + (tmp=Dp[i]);
330	80876		Dp_L2+=tmp*tmp;
331			}
332			//Dp_L2=sqrt(Dp_L2);
333
334	40408	100	if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
		100
335			//if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
336	6		stop=2;
337	6		break;
338			}
339
340	40402	50	if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)*LM_CNST(EPSILON))){ /* almost singular */
		50
341			//if(Dp_L2>=(p_L2+eps2)/LM_CNST(EPSILON)){ /* almost singular */
342	0		stop=4;
343	0		break;
344			}
345
346	40402		(*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */
347			/* compute \|\|e(pDp)\|\|_2 */
348			/* ### hx=x-hx, pDp_eL2=\|\|hx\|\| */
349			#if 1
350	40402		pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);
351			#else
352			for(i=0, pDp_eL2=0.0; i<n; ++i){
353			hx[i]=tmp=x[i]-hx[i];
354			pDp_eL2+=tmp*tmp;
355			}
356			#endif
357	40402		if(!LM_FINITE(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
358			* This check makes sure that the inner loop does not run indefinitely.
359			* Thanks to Steve Danauskas for reporting such cases
360			*/
361	12		stop=7;
362	12		break;
363			}
364
365	121230	100	for(i=0, dL=0.0; i<m; ++i)
		100
366	80840		dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);
367
368	40390		dF=p_eL2-pDp_eL2;
369
370	40390	50	if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
		100
		50
		100
371	30869		tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0));
372	30869		tmp=LM_CNST(1.0)-tmp*tmp*tmp;
373	30869	100	mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) );
		100
374	30869		nu=2;
375
376	92667	100	for(i=0 ; i<m; ++i)
		100
377	61798		p[i]=pDp[i];
378
379	1671085	100	for(i=0; i<n; ++i)
		100
380	1640216		e[i]=hx[i];
381	30869		p_eL2=pDp_eL2;
382	30869		break;
383			}
384			}
385
386			/* if this point is reached, either the linear system could not be solved or
387			* the error did not reduce; in any case, the increment must be rejected
388			*/
389
390	9521		mu*=nu;
391	9521		nu2=nu<<1; // 2*nu;
392	9521	50	if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
		50
393	0		stop=5;
394	0		break;
395			}
396	9521		nu=nu2;
397
398	28563	100	for(i=0; i<m; ++i)
		100
399	19042		jacTjac[i*m+i]=diag_jacTjac[i];
400	9521		} /* inner loop */
401			}
402
403	125	100	if(k>=itmax) stop=3;
		100
404
405	387	100	for(i=0; i<m; ++i)
		100
406	262		jacTjac[i*m+i]=diag_jacTjac[i];
407
408	125	50	if(info){
		50
409	125		info[0]=init_p_eL2;
410	125		info[1]=p_eL2;
411	125		info[2]=jacTe_inf;
412	125		info[3]=Dp_L2;
413	387	100	for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
		100
414	262	100	if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
		100
415	125		info[4]=mu/tmp;
416	125		info[5]=(LM_REAL)k;
417	125		info[6]=(LM_REAL)stop;
418	125		info[7]=(LM_REAL)nfev;
419	125		info[8]=(LM_REAL)njev;
420	125		info[9]=(LM_REAL)nlss;
421			}
422
423			// fprintf(stderr,"lm_core: before freeing\n");
424	387	100	for(i=0; i < m; ++i) {
		100
425			// fprintf(stderr,"lm_core: after fit p[%d]=%e\n",i,p[i]);
426			}
427
428			/* covariance matrix */
429	125	50	if(covar){
		50
430	125		LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
431			}
432
433	125	50	if(freework) free(work);
		50
434
435			#ifdef LINSOLVERS_RETAIN_MEMORY
436	125	50	if(linsolver) (*linsolver)(NULL, NULL, NULL, 0);
		50
437			#endif
438
439	125	50	return (stop!=4 && stop!=7)? k : LM_ERROR;
		100
		50
		50
440			}
441
442
443			/* Secant version of the LEVMAR_DER() function above: the Jacobian is approximated with
444			* the aid of finite differences (forward or central, see the comment for the opts argument)
445			*/
446	18		int LEVMAR_DIF(
447			void (*func)(LM_REAL *p, LM_REAL *hx, int m, int n, void *adata), /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in R^n */
448			LM_REAL *p, /* I/O: initial parameter estimates. On output has the estimated solution */
449			LM_REAL *x, /* I: measurement vector. NULL implies a zero vector */
450			int m, /* I: parameter vector dimension (i.e. #unknowns) */
451			int n, /* I: measurement vector dimension */
452			int itmax, /* I: maximum number of iterations */
453			LM_REAL opts[5], /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
454			* scale factor for initial \mu, stopping thresholds for \|\|J^T e\|\|_inf, \|\|Dp\|\|_2 and \|\|e\|\|_2 and
455			* the step used in difference approximation to the Jacobian. Set to NULL for defaults to be used.
456			* If \delta<0, the Jacobian is approximated with central differences which are more accurate
457			* (but slower!) compared to the forward differences employed by default.
458			*/
459			LM_REAL info[LM_INFO_SZ],
460			/* O: information regarding the minimization. Set to NULL if don't care
461			* info[0]= \|\|e\|\|_2 at initial p.
462			* info[1-4]=[ \|\|e\|\|_2, \|\|J^T e\|\|_inf, \|\|Dp\|\|_2, mu/max[J^T J]_ii ], all computed at estimated p.
463			* info[5]= # iterations,
464			* info[6]=reason for terminating: 1 - stopped by small gradient J^T e
465			* 2 - stopped by small Dp
466			* 3 - stopped by itmax
467			* 4 - singular matrix. Restart from current p with increased mu
468			* 5 - no further error reduction is possible. Restart with increased mu
469			* 6 - stopped by small \|\|e\|\|_2
470			* 7 - stopped by invalid (i.e. NaN or Inf) "func" values. This is a user error
471			* info[7]= # function evaluations
472			* info[8]= # Jacobian evaluations
473			* info[9]= # linear systems solved, i.e. # attempts for reducing error
474			*/
475			LM_REAL *work, /* working memory at least LM_DIF_WORKSZ() reals large, allocated if NULL */
476			LM_REAL *covar, /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
477			void *adata) /* pointer to possibly additional data, passed uninterpreted to func.
478			* Set to NULL if not needed
479			*/
480			{
481			register int i, j, k, l;
482	18		int worksz, freework=0, issolved;
483			/* temp work arrays */
484			LM_REAL *e, /* nx1 */
485			*hx, /* \hat{x}_i, nx1 */
486			*jacTe, /* J^T e_i mx1 */
487			*jac, /* nxm */
488			*jacTjac, /* mxm */
489			*Dp, /* mx1 */
490			*diag_jacTjac, /* diagonal of J^T J, mx1 */
491			*pDp, /* p + Dp, mx1 */
492			*wrk, /* nx1 */
493			*wrk2; /* nx1, used only for holding a temporary e vector and when differentiating with central differences */
494
495	18		int using_ffdif=1;
496
497			register LM_REAL mu, /* damping constant */
498			tmp; /* mainly used in matrix & vector multiplications */
499			LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* \|\|e(p)\|\|_2, \|\|J^T e\|\|_inf, \|\|e(p+Dp)\|\|_2 */
500	18		LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
501			LM_REAL tau, eps1, eps2, eps2_sq, eps3, delta;
502			LM_REAL init_p_eL2;
503	18		int nu, nu2, stop=0, nfev, njap=0, nlss=0, K=(m>=10)? m: 10, updjac, updp=1, newjac;
504	18		const int nm=n*m;
505	18		int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL;
506
507	18		mu=jacTe_inf=p_L2=0.0; /* -Wall */
508	18		updjac=newjac=0; /* -Wall */
509
510	18	50	if(n<m){
		50
511	0		fprintf(stderr, LCAT(LEVMAR_DIF, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m);
512	0		return LM_ERROR;
513			}
514
515	18	50	if(opts){
		50
516	18		tau=opts[0];
517	18		eps1=opts[1];
518	18		eps2=opts[2];
519	18		eps2_sq=opts[2]*opts[2];
520	18		eps3=opts[3];
521	18		delta=opts[4];
522	18	50	if(delta<0.0){
		50
523	0		delta=-delta; /* make positive */
524	18		using_ffdif=0; /* use central differencing */
525			}
526			}
527			else{ // use default values
528	0		tau=LM_CNST(LM_INIT_MU);
529	0		eps1=LM_CNST(LM_STOP_THRESH);
530	0		eps2=LM_CNST(LM_STOP_THRESH);
531	0		eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH);
532	0		eps3=LM_CNST(LM_STOP_THRESH);
533	0		delta=LM_CNST(LM_DIFF_DELTA);
534			}
535
536	18	50	if(!work){
		50
537	0		worksz=LM_DIF_WORKSZ(m, n); //4n+4m + nm + mm;
538	0		work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */
539	0	0	if(!work){
		0
540	0		fprintf(stderr, LCAT(LEVMAR_DIF, "(): memory allocation request failed\n"));
541	0		return LM_ERROR;
542			}
543	0		freework=1;
544			}
545
546			/* set up work arrays */
547	18		e=work;
548	18		hx=e + n;
549	18		jacTe=hx + n;
550	18		jac=jacTe + m;
551	18		jacTjac=jac + nm;
552	18		Dp=jacTjac + m*m;
553	18		diag_jacTjac=Dp + m;
554	18		pDp=diag_jacTjac + m;
555	18		wrk=pDp + m;
556	18		wrk2=wrk + n;
557
558			/* compute e=x - f(p) and its L2 norm */
559	18		(*func)(p, hx, m, n, adata); nfev=1;
560			/* ### e=x-hx, p_eL2=\|\|e\|\| */
561			#if 1
562	18		p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);
563			#else
564			for(i=0, p_eL2=0.0; i<n; ++i){
565			e[i]=tmp=x[i]-hx[i];
566			p_eL2+=tmp*tmp;
567			}
568			#endif
569	18		init_p_eL2=p_eL2;
570	18	50	if(!LM_FINITE(p_eL2)) stop=7;
		50
571
572	18		nu=20; /* force computation of J */
573
574	1336	50	for(k=0; k<itmax && !stop; ++k){
		50
		50
		50
575			/* Note that p and e have been updated at a previous iteration */
576
577	1336	100	if(p_eL2<=eps3){ /* error is small */
		100
578	13		stop=6;
579	13		break;
580			}
581
582			/* Compute the Jacobian J at p, J^T J, J^T e, \|\|J^T e\|\|_inf and \|\|p\|\|^2.
583			* The symmetry of J^T J is again exploited for speed
584			*/
585
586	1323	100	if((updp && nu>16) || updjac==K){ /* compute difference approximation to J */
		100
		100
		100
		100
		100
587	138	50	if(using_ffdif){ /* use forward differences */
		50
588	138		LEVMAR_FDIF_FORW_JAC_APPROX(func, p, hx, wrk, delta, jac, m, n, adata);
589	138		++njap; nfev+=m;
590			}
591			else{ /* use central differences */
592	0		LEVMAR_FDIF_CENT_JAC_APPROX(func, p, wrk, wrk2, delta, jac, m, n, adata);
593	0		++njap; nfev+=2*m;
594			}
595	138		nu=2; updjac=0; updp=0; newjac=1;
596			}
597
598	1323	100	if(newjac){ /* Jacobian has changed, recompute J^T J, J^t e, etc */
		100
599	1269		newjac=0;
600
601			/* J^T J, J^T e */
602	1269	100	if(nm<=__BLOCKSZ__SQ){ // this is a small problem
		100
603			/* J^TJ_ij = \sum_l J^T_il J_lj = \sum_l J_li * J_lj.
604			* Thus, the product J^T J can be computed using an outer loop for
605			* l that adds J_li*J_lj to each element ij of the result. Note that
606			* with this scheme, the accesses to J and JtJ are always along rows,
607			* and therefore incur fewer cache misses compared to the straightforward
608			* algorithm for computing the product (i.e., l loop is innermost one).
609			* A similar scheme applies to the computation of J^T e.
610			* However, for large minimization problems (i.e., involving a large number
611			* of unknowns and measurements) for which J/J^T J rows are too large to
612			* fit in the L1 cache, even this scheme incurs many cache misses. In
613			* such cases, a cache-efficient blocking scheme is preferable.
614			*
615			* Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
616			* performance problem.
617			*
618			* Note that the non-blocking algorithm is faster on small
619			* problems since in this case it avoids the overheads of blocking.
620			*/
621			register int l, im;
622			register LM_REAL alpha, *jaclm;
623
624			/* looping downwards saves a few computations */
625	5745	100	for(i=m*m; i-->0; )
		100
626	4596		jacTjac[i]=0.0;
627	3447	100	for(i=m; i-->0; )
		100
628	2298		jacTe[i]=0.0;
629
630	3568	100	for(l=n; l-->0; ){
		100
631	2419		jaclm=jac+l*m;
632	7257	100	for(i=m; i-->0; ){
		100
633	4838		im=i*m;
634	4838		alpha=jaclm[i]; //jac[l*m+i];
635	12095	100	for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */
		100
636	7257		jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j]
637
638			/* J^T e */
639	4838		jacTe[i]+=alpha*e[l];
640			}
641			}
642
643	3447	100	for(i=m; i-->0; ) /* copy to upper part */
		100
644	3447	100	for(j=i+1; j<m; ++j)
		100
645	1149		jacTjac[i*m+j]=jacTjac[j*m+i];
646			}
647			else{ // this is a large problem
648			/* Cache efficient computation of J^T J based on blocking
649			*/
650	120		LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m);
651
652			/* cache efficient computation of J^T e */
653	410	100	for(i=0; i<m; ++i)
		100
654	290		jacTe[i]=0.0;
655
656	989346	100	for(i=0; i<n; ++i){
		100
657			register LM_REAL *jacrow;
658
659	3256904	100	for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l)
		100
660	2267678		jacTe[l]+=jacrow[l]*tmp;
661			}
662			}
663
664			/* Compute \|\|J^T e\|\|_inf and \|\|p\|\|^2 */
665	3857	100	for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i){
		100
666	2588	100	if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp;
		100
		100
		100
667
668	2588		diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
669	2588		p_L2+=p[i]*p[i];
670			}
671			//p_L2=sqrt(p_L2);
672			}
673
674			#if 0
675			if(!(k%100)){
676			printf("Current estimate: ");
677			for(i=0; i<m; ++i)
678			printf("%.9g ", p[i]);
679			printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
680			}
681			#endif
682
683			/* check for convergence */
684	1323	50	if((jacTe_inf <= eps1)){
		50
685	0		Dp_L2=0.0; /* no increment for p in this case */
686	0		stop=1;
687	0		break;
688			}
689
690			/* compute initial damping factor */
691	1323	100	if(k==0){
		100
692	59	100	for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
		100
693	41	100	if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
		100
694	18		mu=tau*tmp;
695			}
696
697			/* determine increment using adaptive damping */
698
699			/* augment normal equations */
700	4019	100	for(i=0; i<m; ++i)
		100
701	2696		jacTjac[i*m+i]+=mu;
702
703			/* solve augmented equations */
704			#ifdef HAVE_LAPACK
705			/* 6 alternatives are available: LU, Cholesky, 2 variants of QR decomposition, SVD and LDLt.
706			* Cholesky is the fastest but might be inaccurate; QR is slower but more accurate;
707			* SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
708			*/
709
710			issolved=AX_EQ_B_BK(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_BK;
711			//issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
712			//issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL;
713			//issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR;
714			//issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS;
715			//issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;
716			#else
717			/* use the LU included with levmar */
718	1323		issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;
719			#endif /* HAVE_LAPACK */
720
721	1323		if(issolved){
722			/* compute p's new estimate and \|\|Dp\|\|^2 */
723	4019	100	for(i=0, Dp_L2=0.0; i<m; ++i){
		100
724	2696		pDp[i]=p[i] + (tmp=Dp[i]);
725	2696		Dp_L2+=tmp*tmp;
726			}
727			//Dp_L2=sqrt(Dp_L2);
728
729	1323	100	if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */
		100
730			//if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
731	5		stop=2;
732	5		break;
733			}
734
735	1318	50	if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)*LM_CNST(EPSILON))){ /* almost singular */
		50
736			//if(Dp_L2>=(p_L2+eps2)/LM_CNST(EPSILON)){ /* almost singular */
737	0		stop=4;
738	0		break;
739			}
740
741	1318		(*func)(pDp, wrk, m, n, adata); ++nfev; /* evaluate function at p + Dp */
742			/* compute \|\|e(pDp)\|\|_2 */
743			/* ### wrk2=x-wrk, pDp_eL2=\|\|wrk2\|\| */
744			#if 1
745	1318		pDp_eL2=LEVMAR_L2NRMXMY(wrk2, x, wrk, n);
746			#else
747			for(i=0, pDp_eL2=0.0; i<n; ++i){
748			wrk2[i]=tmp=x[i]-wrk[i];
749			pDp_eL2+=tmp*tmp;
750			}
751			#endif
752	1318		if(!LM_FINITE(pDp_eL2)){ /* sum of squares is not finite, most probably due to a user error.
753			* This check makes sure that the loop terminates early in the case
754			* of invalid input. Thanks to Steve Danauskas for suggesting it
755			*/
756
757	0		stop=7;
758	0		break;
759			}
760
761	1318		dF=p_eL2-pDp_eL2;
762	1318	100	if(updp || dF>0){ /* update jac */
		100
		100
		100
763	992894	100	for(i=0; i<n; ++i){
		100
764	3264116	100	for(l=0, tmp=0.0; l<m; ++l)
		100
765	2272486		tmp+=jac[i*m+l]*Dp[l]; /* (J * Dp)[i] */
766	991630		tmp=(wrk[i] - hx[i] - tmp)/Dp_L2; /* (f(p+dp)[i] - f(p)[i] - (J * Dp)[i])/(dp^T*dp) */
767	3264116	100	for(j=0; j<m; ++j)
		100
768	2272486		jac[i*m+j]+=tmp*Dp[j];
769			}
770	1264		++updjac;
771	1264		newjac=1;
772			}
773
774	4004	100	for(i=0, dL=0.0; i<m; ++i)
		100
775	2686		dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);
776
777	1318	50	if(dL>0.0 && dF>0.0){ /* reduction in error, increment is accepted */
		100
		50
		100
778	886		tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0));
779	886		tmp=LM_CNST(1.0)-tmp*tmp*tmp;
780	886	100	mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) );
		100
781	886		nu=2;
782
783	2708	100	for(i=0 ; i<m; ++i)
		100
784	1822		p[i]=pDp[i];
785
786	861751	100	for(i=0; i<n; ++i){
		100
787	860865		e[i]=wrk2[i]; //x[i]-wrk[i];
788	860865		hx[i]=wrk[i];
789			}
790	886		p_eL2=pDp_eL2;
791	886		updp=1;
792	886		continue;
793			}
794			}
795
796			/* if this point is reached, either the linear system could not be solved or
797			* the error did not reduce; in any case, the increment must be rejected
798			*/
799
800	432		mu*=nu;
801	432		nu2=nu<<1; // 2*nu;
802	432	50	if(nu2<=nu){ /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
		50
803	0		stop=5;
804	0		break;
805			}
806	432		nu=nu2;
807
808	1296	100	for(i=0; i<m; ++i)
		100
809	864		jacTjac[i*m+i]=diag_jacTjac[i];
810			}
811
812	18	50	if(k>=itmax) stop=3;
		50
813
814	59	100	for(i=0; i<m; ++i)
		100
815	41		jacTjac[i*m+i]=diag_jacTjac[i];
816
817	18	50	if(info){
		50
818	18		info[0]=init_p_eL2;
819	18		info[1]=p_eL2;
820	18		info[2]=jacTe_inf;
821	18		info[3]=Dp_L2;
822	59	100	for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
		100
823	41	100	if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
		100
824	18		info[4]=mu/tmp;
825	18		info[5]=(LM_REAL)k;
826	18		info[6]=(LM_REAL)stop;
827	18		info[7]=(LM_REAL)nfev;
828	18		info[8]=(LM_REAL)njap;
829	18		info[9]=(LM_REAL)nlss;
830			}
831
832			/* covariance matrix */
833	18	50	if(covar){
		50
834	18		LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
835			}
836
837
838	18	50	if(freework) free(work);
		50
839
840			#ifdef LINSOLVERS_RETAIN_MEMORY
841	18	50	if(linsolver) (*linsolver)(NULL, NULL, NULL, 0);
		50
842			#endif
843
844	18	50	return (stop!=4 && stop!=7)? k : LM_ERROR;
		50
		50
		50
845			}
846
847			/* undefine everything. THIS MUST REMAIN AT THE END OF THE FILE */
848			#undef LEVMAR_DER
849			#undef LEVMAR_DIF
850			#undef LEVMAR_FDIF_FORW_JAC_APPROX
851			#undef LEVMAR_FDIF_CENT_JAC_APPROX
852			#undef LEVMAR_COVAR
853			#undef LEVMAR_TRANS_MAT_MAT_MULT
854			#undef LEVMAR_L2NRMXMY
855			#undef AX_EQ_B_LU
856			#undef AX_EQ_B_CHOL
857			#undef AX_EQ_B_QR
858			#undef AX_EQ_B_QRLS
859			#undef AX_EQ_B_SVD
860			#undef AX_EQ_B_BK