
interpretation of scores/statistics #13

@bertsky

Description

I am trying to make sense of the scores output provided here (which puzzles me compared to what I am used to from pycocotools itself, even more so after I made the iouThrs param freely changeable).

Apparently you have made some modifications to cocoeval...

--- ../pycocotools/PythonAPI/pycocotools/cocoeval.py    2020-04-02 12:41:09.770697735 +0200
+++ pycoco.py   2021-02-16 13:16:39.248434628 +0100
@@ -1,12 +1,11 @@
-__author__ = 'tsungyi'
-
 import numpy as np
 import datetime
 import time
 from collections import defaultdict
-from . import mask as maskUtils
+from pycocotools import mask as maskUtils
 import copy
 
+
 class COCOeval:
     # Interface for evaluating detection on the Microsoft COCO dataset.
     #
@@ -68,7 +67,6 @@
             print('iouType not specified. use default iouType segm')
         self.cocoGt   = cocoGt              # ground truth COCO API
         self.cocoDt   = cocoDt              # detections COCO API
-        self.params   = {}                  # evaluation parameters
         self.evalImgs = defaultdict(list)   # per-image per-category evaluation results [KxAxI] elements
         self.eval     = {}                  # accumulated evaluation results
         self._gts = defaultdict(list)       # gt for evaluation
@@ -203,21 +202,26 @@
         if len(gts) == 0 or len(dts) == 0:
             return []
         ious = np.zeros((len(dts), len(gts)))
-        sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
+        sigmas = p.kpt_oks_sigmas
         vars = (sigmas * 2)**2
         k = len(sigmas)
         # compute oks between each detection and ground truth object
         for j, gt in enumerate(gts):
             # create bounds for ignore regions(double the gt bbox)
             g = np.array(gt['keypoints'])
-            xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
+            xg = g[0::3];
+            yg = g[1::3];
+            vg = g[2::3]
             k1 = np.count_nonzero(vg > 0)
             bb = gt['bbox']
-            x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
-            y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
+            x0 = bb[0] - bb[2];
+            x1 = bb[0] + bb[2] * 2
+            y0 = bb[1] - bb[3];
+            y1 = bb[1] + bb[3] * 2
             for i, dt in enumerate(dts):
                 d = np.array(dt['keypoints'])
-                xd = d[0::3]; yd = d[1::3]
+                xd = d[0::3];
+                yd = d[1::3]
                 if k1>0:
                     # measure the per-keypoint distance if keypoints visible
                     dx = xd - xg
@@ -334,6 +338,7 @@
         M           = len(p.maxDets)
         precision   = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
         recall      = -np.ones((T,K,A,M))
+        scores = -np.ones((T, R, K, A, M))
 
         # create dictionary for future indexing
         _pe = self._paramsEval
@@ -364,6 +369,7 @@
                     # different sorting method generates slightly different results.
                     # mergesort is used to be consistent as Matlab implementation.
                     inds = np.argsort(-dtScores, kind='mergesort')
+                    dtScoresSorted = dtScores[inds]
 
                     dtm  = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
                     dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet]  for e in E], axis=1)[:,inds]
@@ -383,6 +389,7 @@
                         rc = tp / npig
                         pr = tp / (fp+tp+np.spacing(1))
                         q  = np.zeros((R,))
+                        ss = np.zeros((R,))
 
                         if nd:
                             recall[t,k,a,m] = rc[-1]
@@ -391,7 +398,8 @@
 
                         # numpy is slow without cython optimization for accessing elements
                         # use python array gets significant speed improvement
-                        pr = pr.tolist(); q = q.tolist()
+                        pr = pr.tolist();
+                        q = q.tolist()
 
                         for i in range(nd-1, 0, -1):
                             if pr[i] > pr[i-1]:
@@ -401,15 +409,18 @@
                         try:
                             for ri, pi in enumerate(inds):
                                 q[ri] = pr[pi]
+                                ss[ri] = dtScoresSorted[pi]
                         except:
                             pass
                         precision[t,:,k,a,m] = np.array(q)
+                        scores[t, :, k, a, m] = np.array(ss)
         self.eval = {
             'params': p,
             'counts': [T, R, K, A, M],
             'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
             'precision': precision,
             'recall':   recall,
+            'scores': scores,
         }
         toc = time.time()
         print('DONE (t={:0.2f}s).'.format( toc-tic))
@@ -419,7 +430,12 @@
         Compute and display summary metrics for evaluation results.
         Note this functin can *only* be applied on the default parameter setting
         '''
+
+        self.per_class_precisions = []
+        self.ap_per_class_columns = []
+
         def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
+            print(ap, iouThr, areaRng, maxDets)
             p = self.params
             iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
             titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
@@ -448,8 +464,22 @@
                 mean_s = -1
             else:
                 mean_s = np.mean(s[s>-1])
+
+                # cacluate AP(average precision) for each category
+                num_classes = 80
+                avg_ap = 0.0
+                if ap == 1:
+                    pcp = {}
+                    for i, c in enumerate(sorted(list(self.cocoDt.cats.values()), key=lambda x: x['id'])):
+                        pcp[c['name']] = np.mean(s[:, :, i, :])
+
+                    self.per_class_precisions.append(pcp)
+                    self.ap_per_class_columns.append(f"ap={ap} iouThr={iouThr or '0.5:0.95'} area={areaRng} maxDets={maxDets}")
+
+
             print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
             return mean_s
+
         def _summarizeDets():
             stats = np.zeros((12,))
             stats[0] = _summarize(1)
@@ -494,12 +527,13 @@
     '''
     Params for coco evaluation api
     '''
+
     def setDetParams(self):
         self.imgIds = []
         self.catIds = []
         # np.arange causes trouble.  the data point on arange is slightly larger than the true value
-        self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
-        self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
+        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05) + 1), endpoint=True)
+        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01) + 1), endpoint=True)
         self.maxDets = [1, 10, 100]
         self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
         self.areaRngLbl = ['all', 'small', 'medium', 'large']
@@ -509,12 +543,14 @@
         self.imgIds = []
         self.catIds = []
         # np.arange causes trouble.  the data point on arange is slightly larger than the true value
-        self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
-        self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
+        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05) + 1), endpoint=True)
+        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01) + 1), endpoint=True)
         self.maxDets = [20]
         self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
         self.areaRngLbl = ['all', 'medium', 'large']
         self.useCats = 1
+        self.kpt_oks_sigmas = np.array(
+            [.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0
 
     def __init__(self, iouType='segm'):
         if iouType == 'segm' or iouType == 'bbox':

First, some observations:

  • in the per_class_precisions aggregator, the mean includes the empty cells represented as -1, which distorts the average numerically; it needs the equivalent of s[s > -1] (see the first sketch after this list)
  • in its ap_per_class_columns title, iouThr prints a fixed interval for None instead of the actual range taken from the parameters (iouStr would be correct here IIUC)
  • because that calculation is indented (i.e. only runs when len(s[s > -1]) > 0), only columns with matches will be shown (which can be confusing)
  • the Average mAP by class then is merely a contraction (df.mean(axis=1)) of that table, i.e. a macro-average, but IINM the only correct way to average over all axes/ranges is to aggregate by the respective criteria directly, i.e. a micro-average; in this case, you probably just want to pick the first column, which already contains the average over all IoUs, all recall levels, all areas and all numbers of detections (see the second sketch below)
  • it would probably be more interesting to get the precision at the maximum recall and then show both precision and recall side by side, or to pick some other non-avg operating point, like the largest sum / product / F1 / MCC (see the third sketch below)
  • you can make these by-category averages without modifying pycocotools at all, simply by inspecting its eval table (filled by accumulate()) before calling summarize(); here's an example for the max-recall operating point per category:
    recalls = self.cocoeval.eval['recall'][0,:,0,-1] # at min-IoU, all-area, max-detections
    recallInds = np.searchsorted(self.cocoeval.params.recThrs, recalls) - 1 # recall-threshold index reached per category
    classInds = np.arange(len(recalls))
    precisions = self.cocoeval.eval['precision'][0,recallInds,classInds,0,-1] # precision at that max-recall point
    catIds = self.coco_gt.getCatIds()
    for id_, cat in self.coco_gt.cats.items():
        name = cat['name']
        i = catIds.index(id_)
        print(name + ' prc: ' + str(precisions[i]))
        print(name + ' rec: ' + str(recalls[i]))
  • I don't understand the dtScoresSorted and scores modification TBH, but it does not seem to be used anywhere. Could it be this is just an earlier attempt at what you do with per_image_scores?
  • If so, perhaps the custom pycoco.py could be removed entirely, depending solely on pycocotools.cocoeval?
  • In the per_image_scores calculation, there are several non-numeric fields (like the lists of scores of the true positives, false positives and false negatives, or the list of IoUs of GT regions, or the list of categories). But they are all gone in the displayed table – IIUC because the sum() removes them. Isn't there a way to keep both the sums and the lists/columns in the table? I don't know much about pandas TBH. (But as it is, the table only shows me the numerical sum over all categories.) See the last sketch below for what I mean.
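
To make the first three points concrete, here is roughly what I mean for the per-class block inside _summarize (just a rough, untested sketch; it reuses the s, iouStr, areaRng and maxDets already in scope there, and runs regardless of whether there were any matches):

    # rough sketch (untested): per-class AP for every column, masking out the -1
    # placeholders, and labelling the column with the iouStr computed above
    if ap == 1:
        pcp = {}
        for i, c in enumerate(sorted(self.cocoDt.cats.values(), key=lambda x: x['id'])):
            s_c = s[:, :, i, :]
            valid = s_c[s_c > -1]
            pcp[c['name']] = np.mean(valid) if valid.size else float('nan')
        self.per_class_precisions.append(pcp)
        self.ap_per_class_columns.append(
            f"ap={ap} iouThr={iouStr} area={areaRng} maxDets={maxDets}")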
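
For the Average mAP by class contraction, something like this then (hypothetical df built from the two lists above; the first column is the one produced by _summarize(1), i.e. all IoUs / all areas / maxDets=100):

    import pandas as pd
    # one row per class, one column per _summarize call
    df = pd.DataFrame(self.per_class_precisions, index=self.ap_per_class_columns).T
    avg_map_by_class = df.iloc[:, 0]   # rather than df.mean(axis=1)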
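
Regarding other operating points, here is a rough sketch (untested, same eval table layout and np as in the snippet above; IIUC the K axis follows the sorted category ids) that picks, per category, the recall threshold with the best F1:

    prec = self.cocoeval.eval['precision'][0, :, :, 0, -1]  # min-IoU, all recThrs, all cats, all-area, max-dets
    recs = self.cocoeval.params.recThrs[:, None]            # recall thresholds as a column vector
    f1 = 2 * prec * recs / np.maximum(prec + recs, np.spacing(1))
    f1[prec <= -1] = -1                                      # ignore absent/empty cells
    best = f1.argmax(axis=0)                                 # best operating point per category
    for i, cid in enumerate(sorted(self.coco_gt.getCatIds())):
        name = self.coco_gt.cats[cid]['name']
        print('{} prc: {:.3f} rec: {:.3f} (F1={:.3f})'.format(
            name, prec[best[i], i], self.cocoeval.params.recThrs[best[i]], f1[best[i], i]))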
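
And for the last point, maybe something along these lines would keep both (I am only guessing here; the column names are mere placeholders for whatever per_image_scores actually contains):

    import pandas as pd
    # aggregate the numeric columns with sum, but keep the list-valued columns by
    # concatenating them instead of dropping them (column names are hypothetical)
    aggregations = {
        'num_tp': 'sum',
        'num_fp': 'sum',
        'num_fn': 'sum',
        'tp_scores': lambda col: sum(col, []),        # concatenate per-image score lists
        'gt_ious': lambda col: sum(col, []),
        'categories': lambda col: sorted(set(sum(col, []))),
    }
    summary = per_image_scores_df.groupby('image_id').agg(aggregations)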
