MODULE Logistic_Regression

! A package for fitting linear logistic models by iteratively re-weighted
! least squares.

! The model fitted is that the probability of a `success' is:

! p = 1 - 1/[1 + exp(b0 + b1.X1 + b2.X2 + ... + bk.Xk)]

! where X1, X2, ... , Xk are the predictor variables, and the coefficients
! b0, b1, b2, ..., bk are to be determined.

! N.B. The residual variance used here in estimating standard errors is the
!      larger of 1.0 and that from the weighted least squares calculations;
!      it is not the theoretical residual variance (1.0) assuming a binomial
!      distribution about the logistic curve.   If the fit of the logistic
!      is poor, the standard errors of the coefficients will be much larger
!      than those based on the theoretical variance of 1.0.

! The calculation of chi-squared was corrected on 9 August 2003.
! My thanks to Zhang Guan

! By Alan Miller
! amiller @ bigpond.net.au
! users.bigpond.net.au/amiller/

! Latest revision - 9 August 2003

USE lsq
IMPLICIT NONE

PUBLIC  :: logistic

CONTAINS


SUBROUTINE logistic(ngroups, x, k, s, n, chisq, devnce, ndf, beta, se_beta, &
                    ier, cov_beta, fit, stdres)

! Input arguments:

! ngroups      The number of groups of observations.
! x(:,:)       x(i,j) contains the value of the j-th predictor for group i.
!              The X-values are the same for all the n(i) trials in group i.
! k            The number of predictors.   N.B. A constant will be fitted in
!              the model; do not include it in the k predictor variables.
! s(:), n(:)   s(i) is the number of `successes' out of the n(i) `trials' in
!              the i-th group.   In many applications, each n(i) = 1, that is
!              each case will be treated individually, and then s(i) = 0 or 1
!              for each of the two possible outcomes.

INTEGER, INTENT(IN)    :: ngroups, k, s(:), n(:)
REAL (dp), INTENT(IN)  :: x(:,:)

! Output arguments:

! chisq        The value of chi-squared on exit, when a model has been fitted.
! devnce       The deviance on exit, when a model has been fitted.
! ndf          Number of degrees of freedom.
! beta(0:)     The fitted coefficients in the logistic model.
! se_beta(0:)  Approximate standard errors of the beta coefficients.
! ier          Error indicator
!              = 0 successful termination
!              = 1 if ngroups < 2 or ndf < 0
!              = 2 if any n(i) < 0
!              = 3 if any s(i) < 0
!              = 4 if any s(i) > n(i)
!              = 5 if any X-variable is constant
!              = 6 if a singularity is detected
!              = 7 if any beta(i) is tending to +/- infinity
!              = 8 failure to converge in 20 iterations

REAL (dp), INTENT(OUT)  :: chisq, devnce, beta(0:), se_beta(0:)
INTEGER, INTENT(OUT)    :: ndf, ier

! Optional output arguments:

! cov_beta(0:,0:)  Approximate covariance matrix of the fitted coefficients.
! fit(:)           The fitted probabilities of a success for each group.
! stdres(:)        Vector of standardized residuals.

REAL (dp), INTENT(OUT), OPTIONAL  :: cov_beta(0:,0:), fit(:), stdres(:)

! Local variables

INTEGER    :: i, iter, j, ncov, pos
REAL (dp)  :: propn(ngroups), p(ngroups), wt(ngroups), xrow(0:k), db(0:k),  &
              bnew(0:k), dev_new, xb, pnew(ngroups), wnew(ngroups), a, b,   &
              range(k), var, e(ngroups), hii
LOGICAL    :: lindep(0:k)
REAL (dp), ALLOCATABLE  :: covmat(:)

! Initial checks

ier = 0
ndf = ngroups - k - 1
IF (ngroups < 2 .OR. ndf < 0) THEN
  ier = 1
  RETURN
END IF
IF (ANY(n(1:ngroups) < 0)) THEN
  ier = 2
  RETURN
END IF
IF (ANY(s(1:ngroups) < 0)) THEN
  ier = 3
  RETURN
END IF
IF (ANY(s(1:ngroups) > n(1:ngroups))) THEN
  ier = 4
  RETURN
END IF

! Calculate ranges of the X-variables to use in testing whether any beta
! is tending to +/- infinity.  Also test that no variable is constant.
DO i = 1, k
  a = MAXVAL(x(1:ngroups,i))
  b = MINVAL(x(1:ngroups,i))
  range(i) = a - b
  IF (range(i) < EPSILON(0.0_dp) * (ABS(a) + ABS(b))) THEN
    ier = 5
    RETURN
  END IF
END DO

! Start with all beta's = 0 and weights = 1.

beta(0:k) = 0.0_dp
wt(1:ngroups) = 1.0_dp
p(1:ngroups) = 0.5_dp

! propn stores the sample proportions, i.e. s(i) / n(i)

propn(1:ngroups) = REAL(s(1:ngroups), KIND=dp) / n(1:ngroups)

iter = 1

! Start of iterative cycle

DO
  CALL startup(k, .TRUE.)
  DO i = 1, ngroups
    IF (iter == 1) THEN
      xrow(0) = 0.25_dp
      xrow(1:k) = 0.25_dp*x(i, 1:k)
    ELSE
      xrow(0) = p(i)*(1.0_dp - p(i))
      xrow(1:k) = p(i)*(1.0_dp - p(i))*x(i, 1:k)
    END IF
    CALL includ(wt(i), xrow, propn(i)-p(i))
  END DO

! Test for a singularity

  CALL sing(lindep, ier)
  IF (ier /= 0) THEN
    DO i = 1, k
      IF (lindep(i)) THEN
        WRITE(*, '(a, i6, a)') ' Variable number ', i,  &
                               ' is linearly dependent upon earlier variables'
      END IF
    END DO
    ier = 6
    RETURN
  END IF

  CALL regcf(db, k+1, ier)

10 bnew = beta(0:k) + db

! Calculate new p(i)'s, weights & deviance

  dev_new = 0.0_dp
  DO i = 1, ngroups
    xb = DOT_PRODUCT( x(i,1:k), bnew(1:k) ) + bnew(0)
    xb = EXP(xb)
    pnew(i) = xb / (1.0_dp + xb)
    wnew(i) = REAL(n(i), KIND=dp) / (pnew(i)*(1.0_dp - pnew(i)))
    IF (iter == 1) wnew(i) = SQRT(wnew(i))
    IF (s(i) > 0) dev_new = dev_new + s(i)*LOG(propn(i)/pnew(i))
    IF (s(i) < n(i)) dev_new = dev_new +  &
                     (n(i)-s(i))*LOG((1.0_dp-propn(i))/(1.0_dp-pnew(i)))
  END DO
  dev_new = 2 * dev_new

! If deviance has increased, reduce the step size.

  IF (iter > 2) THEN
    IF (dev_new > devnce*1.0001_dp) THEN
      db = 0.5_dp * db
      GO TO 10
    END IF
  END IF

! Replace betas, weights & p's with new values

  beta(0:k) = bnew(0:k)
  wt = wnew
  p(1:ngroups) = pnew

! Test for convergence

  IF (iter > 2 .AND. devnce - dev_new < 0.0001_dp) EXIT
  devnce = dev_new
  iter = iter + 1
  IF (iter > 20) THEN
    ier = 8
    RETURN
  END IF

! Test for a very large beta

  DO i = 1, k
    IF (ABS(beta(i))*range(i) > 30.0_dp) THEN
      WRITE(*, '(a, i4, a)') ' Coefficient for variable no.', i,  &
                             ' tending to infinity'
      ier = 7
      RETURN
    END IF
  END DO
END DO

e = n(1:ngroups)*p(1:ngroups)
chisq = SUM( (s(1:ngroups) - e)**2 / (e * (1.0_dp - p(1:ngroups))) )
devnce = dev_new

! Calculate the approximate covariance matrix for the beta's, if ndf > 0.

IF (ndf > 0) THEN
  ncov = (k+1)*(k+2)/2
  ALLOCATE( covmat(ncov) )
  CALL cov(k+1, var, covmat, ncov, se_beta, ier)
  IF (var < 1.0_dp) THEN
    covmat = covmat / var
    se_beta = se_beta / SQRT(var)
  END IF
  IF(PRESENT(cov_beta)) THEN
    pos = 1
    DO i = 0, k
      cov_beta(i,i) = covmat(pos)
      pos = pos + 1
      DO j = i+1, k
        cov_beta(i,j) = covmat(pos)
        cov_beta(j,i) = covmat(pos)
        pos = pos + 1
      END DO
    END DO
  END IF
END IF

IF(PRESENT(fit)) fit(1:ngroups) = p

IF (PRESENT(stdres)) THEN
  DO i = 1, ngroups
    xrow(0) = p(i)*(1.0_dp - p(i))
    xrow(1:k) = p(i)*(1.0_dp - p(i))*x(i, 1:k)
    CALL hdiag(xrow, k+1, hii, ier)
    stdres(i) = (s(i) - n(i)*p(i)) /  &
                SQRT(n(i)*p(i)*(1.0_dp - p(i))*(1.0_dp - hii))
  END DO
END IF

IF (ALLOCATED(covmat)) DEALLOCATE( covmat )

RETURN
END SUBROUTINE logistic


FUNCTION chi_squared(ndf, chi2) RESULT(prob)

! Calculate the chi-squared distribution function
! ndf  = number of degrees of freedom
! chi2 = chi-squared value
! prob = probability of a chi-squared value <= chi2 (i.e. the left-hand
!        tail area)

INTEGER, INTENT(IN)    :: ndf
REAL (dp), INTENT(IN)  :: chi2
REAL (dp)              :: prob

! Local variables

REAL (dp)  :: half = 0.5_dp, x, p

x = half * chi2
p = half * REAL(ndf)
prob = gammad(x, p)

RETURN
END FUNCTION chi_squared
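
! ---------------------------------------------------------------------------
! Illustrative sketch only (not part of the original interface).  chi_squared
! above returns the LEFT-hand tail area, so the p-value usually quoted for
! the deviance or Pearson chi-squared goodness-of-fit statistic is its
! complement.  A hypothetical convenience wrapper might look like this.

FUNCTION chi2_upper_tail(ndf, chi2) RESULT(pvalue)

! Upper-tail area of the chi-squared distribution with ndf degrees of
! freedom, i.e. Pr(X >= chi2); small values indicate a poor fit.

INTEGER, INTENT(IN)    :: ndf
REAL (dp), INTENT(IN)  :: chi2
REAL (dp)              :: pvalue

pvalue = 1.0_dp - chi_squared(ndf, chi2)

RETURN
END FUNCTION chi2_upper_tail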
FUNCTION gammad(x, p) RESULT(gamma_prob)

! ALGORITHM AS239  APPL. STATIST. (1988) VOL. 37, NO. 3

! Computation of the Incomplete Gamma Integral

! Auxiliary functions required: ALNORM = algorithm AS66 (included) & LNGAMMA

! Converted to be compatible with ELF90 by Alan Miller
! N.B. The return parameter IFAULT has been removed as ELF90 allows only
! one output parameter from functions.  An error message is issued instead.

! This revision - 15 October 1996

REAL (dp), INTENT(IN)  :: x, p
REAL (dp)              :: gamma_prob

! Local variables

REAL (dp)  :: pn1, pn2, pn3, pn4, pn5, pn6, tol = 1.d-14, oflo = 1.d+37,    &
              xbig = 1.d+8, arg, c, rn, a, b, one = 1._dp, zero = 0._dp, an, &
              two = 2._dp, elimit = -88._dp, plimit = 1000._dp, three = 3._dp, &
              nine = 9._dp

gamma_prob = zero

! Check that we have valid values for X and P

IF (p <= zero .OR. x < zero) THEN
  WRITE(*, *)'Error: Function gammad.  1st argument < 0 or 2nd argument <= 0'
  RETURN
END IF
IF (x == zero) RETURN

! Use a normal approximation if P > PLIMIT

IF (p > plimit) THEN
  pn1 = three * SQRT(p) * ((x / p) ** (one / three) + one / (nine * p) - one)
  gamma_prob = alnorm(pn1, .false.)
  RETURN
END IF

! If X is extremely large compared to P then set gamma_prob = 1

IF (x > xbig) THEN
  gamma_prob = one
  RETURN
END IF

IF (x <= one .OR. x < p) THEN

! Use Pearson's series expansion.
! (Note that P is not large enough to force overflow in LNGAMMA)

  arg = p * LOG(x) - x - lngamma(p + one)
  c = one
  gamma_prob = one
  a = p
  DO
    a = a + one
    c = c * x / a
    gamma_prob = gamma_prob + c
    IF (c < tol) EXIT
  END DO
  arg = arg + LOG(gamma_prob)
  gamma_prob = zero
  IF (arg >= elimit) gamma_prob = EXP(arg)

ELSE

! Use a continued fraction expansion

  arg = p * LOG(x) - x - lngamma(p)
  a = one - p
  b = a + x + one
  c = zero
  pn1 = one
  pn2 = x
  pn3 = x + one
  pn4 = x * b
  gamma_prob = pn3 / pn4
  DO
    a = a + one
    b = b + two
    c = c + one
    an = a * c
    pn5 = b * pn3 - an * pn1
    pn6 = b * pn4 - an * pn2
    IF (ABS(pn6) > zero) THEN
      rn = pn5 / pn6
      IF (ABS(gamma_prob - rn) <= MIN(tol, tol * rn)) EXIT
      gamma_prob = rn
    END IF
    pn1 = pn3
    pn2 = pn4
    pn3 = pn5
    pn4 = pn6
    IF (ABS(pn5) >= oflo) THEN

! Re-scale terms in continued fraction if terms are large

      pn1 = pn1 / oflo
      pn2 = pn2 / oflo
      pn3 = pn3 / oflo
      pn4 = pn4 / oflo
    END IF
  END DO
  arg = arg + LOG(gamma_prob)
  gamma_prob = one
  IF (arg >= elimit) gamma_prob = one - EXP(arg)
END IF

RETURN
END FUNCTION gammad


FUNCTION alnorm(x, upper) RESULT(norm_prob)

! Algorithm AS66 Applied Statistics (1973) vol.22, no.3

! Evaluates the tail area of the standardised normal curve
! from x to infinity if upper is .true. or
! from minus infinity to x if upper is .false.

REAL (dp), INTENT(IN)  :: x
LOGICAL, INTENT(IN)    :: upper
REAL (dp)              :: norm_prob

! Local variables

REAL (dp)  :: zero = 0.0_dp, one = 1.0_dp, half = 0.5_dp
REAL (dp)  :: con = 1.28_dp, z, y, ltone = 7.0_dp, utzero = 18.66_dp
REAL (dp)  :: p = 0.398942280444_dp, q = 0.39990348504_dp,   &
              r = 0.398942280385_dp, a1 = 5.75885480458_dp,  &
              a2 = 2.62433121679_dp, a3 = 5.92885724438_dp,  &
              b1 = -29.8213557807_dp, b2 = 48.6959930692_dp, &
              c1 = -3.8052D-8, c2 = 3.98064794D-4,           &
              c3 = -0.151679116635_dp, c4 = 4.8385912808_dp, &
              c5 = 0.742380924027_dp, c6 = 3.99019417011_dp, &
              d1 = 1.00000615302_dp, d2 = 1.98615381364_dp,  &
              d3 = 5.29330324926_dp, d4 = -15.1508972451_dp, &
              d5 = 30.789933034_dp
LOGICAL    :: up

up = upper
z = x
IF(z >= zero) GO TO 10
up = .NOT. up
z = -z
10 IF(z <= ltone .OR. up .AND. z <= utzero) GO TO 20
norm_prob = zero
GO TO 40
20 y = half*z*z
IF(z > con) GO TO 30
norm_prob = half - z*(p-q*y/(y+a1+b1/(y+a2+b2/(y+a3))))
GO TO 40
30 norm_prob = r*EXP(-y)/(z+c1+d1/(z+c2+d2/(z+c3+d3/(z+c4+d4/(z+c5+d5/(z+c6))))))
40 IF(.NOT. up) norm_prob = one - norm_prob

RETURN
END FUNCTION alnorm
FUNCTION lngamma(z) RESULT(lanczos)

!  Uses Lanczos-type approximation to ln(gamma) for z > 0.

!  Reference:
!       Lanczos, C. 'A precision approximation of the gamma
!               function', J. SIAM Numer. Anal., B, 1, 86-96, 1964.

!  Accuracy: About 14 significant digits except for small regions
!            in the vicinity of 1 and 2.

!  Programmer: Alan Miller
!              1 Creswick Street, Brighton, Vic. 3187, Australia

!  Latest revision - 14 October 1996

REAL(dp), INTENT(IN)  :: z
REAL(dp)              :: lanczos

! Local variables

REAL(dp)  :: a(9) = (/ 0.9999999999995183_dp, 676.5203681218835_dp,    &
                       -1259.139216722289_dp, 771.3234287757674_dp,    &
                       -176.6150291498386_dp, 12.50734324009056_dp,    &
                       -0.1385710331296526_dp, 0.9934937113930748D-05, &
                       0.1659470187408462D-06 /), zero = 0._dp,        &
             one = 1._dp, lnsqrt2pi = 0.9189385332046727_dp,           &
             half = 0.5_dp, sixpt5 = 6.5_dp, seven = 7._dp, tmp
INTEGER   :: j

IF (z <= zero) THEN
  WRITE(*, *)'Error: zero or -ve argument for lngamma'
  lanczos = zero      ! Return a defined value on the error path
  RETURN
END IF

lanczos = zero
tmp = z + seven
DO j = 9, 2, -1
  lanczos = lanczos + a(j)/tmp
  tmp = tmp - one
END DO
lanczos = lanczos + a(1)
lanczos = LOG(lanczos) + lnsqrt2pi - (z + sixpt5) + (z - half)*LOG(z + sixpt5)

RETURN
END FUNCTION lngamma

END MODULE Logistic_Regression
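
! ---------------------------------------------------------------------------
! Example driver (a minimal sketch, not part of Alan Miller's original file).
! It assumes the lsq module is compiled and linked, and uses made-up data:
! 10 groups, one predictor, with n(i) trials and s(i) successes per group.
! The deviance is converted to an approximate goodness-of-fit p-value with
! chi_squared.  All data values below are purely illustrative.

PROGRAM logistic_example
USE Logistic_Regression
IMPLICIT NONE

INTEGER, PARAMETER :: ngroups = 10, k = 1
INTEGER    :: s(ngroups), n(ngroups), ndf, ier
REAL (dp)  :: x(ngroups,k), chisq, devnce, beta(0:k), se_beta(0:k), pvalue

! Hypothetical dose-response style data: success counts rise with x.
x(:,1) = (/ 1.0_dp, 2.0_dp, 3.0_dp, 4.0_dp, 5.0_dp,  &
            6.0_dp, 7.0_dp, 8.0_dp, 9.0_dp, 10.0_dp /)
n = 20
s = (/ 1, 2, 3, 5, 8, 11, 14, 16, 18, 19 /)

CALL logistic(ngroups, x, k, s, n, chisq, devnce, ndf, beta, se_beta, ier)
IF (ier /= 0) THEN
  WRITE(*, '(a, i2)') ' logistic returned error code ', ier
  STOP
END IF

WRITE(*, '(a, 2f10.4)') ' Intercept, slope: ', beta(0), beta(1)
WRITE(*, '(a, 2f10.4)') ' Standard errors:  ', se_beta(0), se_beta(1)
WRITE(*, '(a, f10.4, a, i3, a)') ' Deviance = ', devnce, ' on ', ndf, ' d.f.'
pvalue = 1.0_dp - chi_squared(ndf, devnce)
WRITE(*, '(a, f8.4)') ' Approx. goodness-of-fit p-value ', pvalue

END PROGRAM logistic_example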