Intel(r) Performance Counter Monitor
cpucounters.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2009-2013, Intel Corporation
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6 
7  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9  * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 
11 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 */
13 // written by Roman Dementiev
14 // Thomas Willhalm
15 
16 #ifndef CPUCOUNTERS_HEADER
17 #define CPUCOUNTERS_HEADER
18 
25 #define INTEL_PCM_VERSION "V2.8 ($Format:%ci ID=%h$)"
26 
27 #ifndef INTELPCM_API
28 #define INTELPCM_API
29 #endif
30 
31 #include "types.h"
32 #include "msr.h"
33 #include "pci.h"
34 #include "client_bw.h"
35 #include "width_extender.h"
36 #include <vector>
37 #include <limits>
38 #include <string>
39 #include <string.h>
40 
41 #ifdef PCM_USE_PERF
42 #include <linux/perf_event.h>
43 #include <sys/syscall.h>
44 #include <errno.h>
45 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
46 #endif
47 
48 #ifndef _MSC_VER
49 #define NOMINMAX
50 #include <semaphore.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <fcntl.h>
54 #include <unistd.h>
55 #endif
56 
57 class SystemCounterState;
58 class SocketCounterState;
59 class CoreCounterState;
60 class BasicCounterState;
62 class PCM;
63 
64 /*
65  CPU performance monitoring routines
66 
67  A set of performance monitoring routines for recent Intel CPUs
68 */
69 
70 struct INTELPCM_API TopologyEntry // decribes a core
71 {
72  int32 os_id;
73  int32 socket;
74  int32 core_id;
75 
76  TopologyEntry() : os_id(-1), socket(-1), core_id(-1) { }
77 };
78 
81 {
82  int bus, groupnr;
83  PciHandleM ** imcHandles;
84  uint32 num_imc_channels;
85 
86  PciHandleM ** qpiLLHandles;
87  uint32 num_qpi_ports;
88  std::vector<uint64> qpi_speed;
89  uint32 num_imc;
90  uint32 MCX_CHY_REGISTER_DEV_ADDR[2][4];
91  uint32 MCX_CHY_REGISTER_FUNC_ADDR[2][4];
92  uint32 QPI_PORTX_REGISTER_DEV_ADDR[3];
93  uint32 QPI_PORTX_REGISTER_FUNC_ADDR[3];
94 
95  static std::vector<std::pair<uint32,uint32> > socket2bus;
96  void initSocket2Bus();
97 
98  ServerPCICFGUncore(); // forbidden
99  ServerPCICFGUncore(ServerPCICFGUncore &); // forbidden
100  PciHandleM * createIntelPerfMonDevice(uint32 groupnr, uint32 bus, uint32 dev, uint32 func, bool checkVendor = false);
101 
102 public:
106  ServerPCICFGUncore(uint32 socket_, PCM * pcm);
108  void program();
110  uint64 getImcReads();
112  uint64 getImcWrites();
113 
116  uint64 getIncomingDataFlits(uint32 port);
117 
120  uint64 getOutgoingDataNonDataFlits(uint32 port);
121 
122  virtual ~ServerPCICFGUncore();
123 
126  void program_power_metrics(int mc_profile);
127 
130  uint64 getQPIClocks(uint32 port);
131 
134  uint64 getQPIL0pTxCycles(uint32 port);
137  uint64 getQPIL1Cycles(uint32 port);
140  uint64 getDRAMClocks(uint32 channel);
144  uint64 getMCCounter(uint32 channel, uint32 counter);
148  uint64 getQPILLCounter(uint32 port, uint32 counter);
149 
151  void freezeCounters();
153  void unfreezeCounters();
154 
156  uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel);
157 
159  void enableJKTWorkaround(bool enable);
160 
162  uint32 getNumQPIPorts() const { return num_qpi_ports; }
163 
165  uint64 getQPILinkSpeed(const uint32 linkNr) const {
166  return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
167  }
168 
170  void reportQPISpeed() const
171  {
172  std::cerr.precision(1);
173  std::cerr << std::fixed;
174  for (uint32 i=0; i<qpi_speed.size(); ++i)
175  std::cerr << "Max QPI link " << i << " speed: " << qpi_speed[i] / (1e9) << " GBytes/second (" << qpi_speed[i] / (2e9) << " GT/second)" << std::endl;
176  }
177 
179  uint32 getNumMC() const { return num_imc; }
180 
182  uint32 getNumMCChannels() const { return num_imc_channels; }
183 };
184 
186 {
187  friend uint64 getNumberOfEvents(PCIeCounterState before, PCIeCounterState after);
188  friend class PCM;
189  uint64 data;
190 public:
191  PCIeCounterState(): data(0)
192  {
193  }
194  virtual ~PCIeCounterState() {}
195 };
196 
197 #ifndef HACK_TO_REMOVE_DUPLICATE_ERROR
198 template class INTELPCM_API std::allocator<TopologyEntry>;
199 template class INTELPCM_API std::vector<TopologyEntry>;
200 template class INTELPCM_API std::allocator<CounterWidthExtender*>;
201 template class INTELPCM_API std::vector<CounterWidthExtender*>;
202 template class INTELPCM_API std::allocator<uint32>;
203 template class INTELPCM_API std::vector<uint32>;
204 template class INTELPCM_API std::allocator<char>;
205 #endif
206 
212 class INTELPCM_API PCM
213 {
214  friend class BasicCounterState;
215  friend class UncoreCounterState;
216  PCM(); // forbidden to call directly because it is a singleton
217 
218  int32 cpu_family;
219  int32 cpu_model, original_cpu_model;
220  int32 threads_per_core;
221  int32 num_cores;
222  int32 num_sockets;
223  int32 num_phys_cores_per_socket;
224  int32 num_online_cores;
225  uint32 core_gen_counter_num_max;
226  uint32 core_gen_counter_num_used;
227  uint32 core_gen_counter_width;
228  uint32 core_fixed_counter_num_max;
229  uint32 core_fixed_counter_num_used;
230  uint32 core_fixed_counter_width;
231  uint32 uncore_gen_counter_num_max;
232  uint32 uncore_gen_counter_num_used;
233  uint32 uncore_gen_counter_width;
234  uint32 uncore_fixed_counter_num_max;
235  uint32 uncore_fixed_counter_num_used;
236  uint32 uncore_fixed_counter_width;
237  int32 perfmon_version;
238  int32 perfmon_config_anythread;
239  uint64 nominal_frequency;
240  uint64 max_qpi_speed; // in GBytes/second
241  uint32 L3ScalingFactor;
242  int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
243 
244  std::vector<TopologyEntry> topology;
245  std::string errorMessage;
246 
247  static PCM * instance;
248  bool allow_multiple_instances;
249  bool programmed_pmu;
250  SafeMsrHandle ** MSR;
251  ServerPCICFGUncore ** server_pcicfg_uncore;
252  uint32 PCU_MSR_PMON_BOX_CTL_ADDR, PCU_MSR_PMON_CTRX_ADDR[4];
253  double joulesPerEnergyUnit;
254  std::vector<CounterWidthExtender*> snb_energy_status;
255  std::vector<CounterWidthExtender*> jkt_dram_energy_status;
256 
257 
258  ClientBW * clientBW;
259  CounterWidthExtender * clientImcReads;
260  CounterWidthExtender * clientImcWrites;
261  CounterWidthExtender * clientIoRequests;
262 
263  bool disable_JKT_workaround;
264  bool blocked; // track if time-driven counter update is running or not: PCM is blocked
265 
266  uint64 * coreCStateMsr; // MSR addresses of core C-state free-running counters
267  uint64 * pkgCStateMsr; // MSR addresses of package C-state free-running counters
268 
269 public:
270  enum { MAX_C_STATE = 10 }; // max C-state on Intel architecture
271 
274  {
275  if (state == 0 || state == 1)
276  return true;
277 
278  return (coreCStateMsr != NULL && state <= MAX_C_STATE && coreCStateMsr[state] != 0);
279  }
280 
283  {
284  return (pkgCStateMsr != NULL && state <= MAX_C_STATE && pkgCStateMsr[state] != 0);
285  }
286 
288  void setOutput(const std::string filename);
289 
291  void restoreOutput();
292 
294  // Arguments:
295  // -- 1 - program is running
296  // -- 0 -pgram is sleeping
297  void setRunState(int new_state) { run_state = new_state; }
298 
300  // Results:
301  // -- 1 - program is running
302  // -- 0 -pgram is sleeping
303  int getRunState(void) { return run_state; }
304 
305  bool isBlocked(void) { return blocked; }
306  void setBlocked(const bool new_blocked) { blocked = new_blocked; }
307 
310  {
311  allow_multiple_instances = true;
312  }
313 
315  enum ProgramMode {
320  };
321 
// Result codes, used as the return type of program() and
// programServerUncorePowerMetrics().
enum ErrorCode
{
    Success = 0,
    MSRAccessDenied = 1,
    PMUBusy = 2,
    UnknownError = 3    // same value the enumerator gets implicitly after PMUBusy
};
329 
338  {
339  int32 event_number, umask_value;
340  };
341 
352  {
353  FixedEventControlRegister * fixedCfg; // if NULL, then default configuration performed for fixed counters
354  uint32 nGPCounters; // number of general purpose counters
355  EventSelectRegister * gpCounterCfg; // general purpose counters, if NULL, then default configuration performed for GP counters
356  uint64 OffcoreResponseMsrValue[2];
357  };
358 
359 private:
360  ProgramMode mode;
361  CustomCoreEventDescription coreEventDesc[4];
362 
363  #ifdef _MSC_VER
364  HANDLE numInstancesSemaphore; // global semaphore that counts the number of PCM instances on the system
365  #else
366  // global semaphore that counts the number of PCM instances on the system
367  sem_t * numInstancesSemaphore;
368  #endif
369 
370  std::vector<int32> socketRefCore;
371 
372  bool canUsePerf;
373 #ifdef PCM_USE_PERF
374  std::vector< std::vector<int> > perfEventHandle;
375  void readPerfData(uint32 core, std::vector<uint64> & data);
376 
377  enum {
378  PERF_INST_RETIRED_ANY_POS = 0,
379  PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
380  PERF_CPU_CLK_UNHALTED_REF_POS = 2,
381  PERF_GEN_EVENT_0_POS = 3,
382  PERF_GEN_EVENT_1_POS = 4,
383  PERF_GEN_EVENT_2_POS = 5,
384  PERF_GEN_EVENT_3_POS = 6
385  };
386 
387  enum {
388  PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_ANY_POS
389  };
390 #endif
391  std::ofstream *outfile; // output file stream
392  std::streambuf *backup_ofile; // backup of original output = cout
393  int run_state; // either running (1) or sleeping (0)
394 
395  bool PMUinUse();
396  void cleanupPMU();
397  void freeRMID();
398  bool decrementInstanceSemaphore(); // returns true if it was the last instance
399 
400 #ifdef __APPLE__
401  // OSX does not have sem_getvalue, so we must get the number of instances by a different method
402  uint32 getNumInstances();
403  uint32 decrementNumInstances();
404  uint32 incrementNumInstances();
405 #endif
406 
407 
408  void computeQPISpeedBeckton(int core_nr);
409  void destroyMSR();
410  void computeNominalFrequency();
411  static bool isCPUModelSupported(int model_);
412  std::string getSupportedUarchCodenames() const;
413  std::string getUnsupportedMessage() const;
414  bool detectModel();
415  bool checkModel();
416 
417  void initCStateSupportTables();
418  bool discoverSystemTopology();
419  void printSystemTopology() const;
420  void initMSR();
421  bool detectNominalFrequency();
422  void initEnergyMonitoring();
423  void initUncoreObjects();
429  void initL3CacheOccupancyMonitoring();
430  void programBecktonUncore(int core);
431  void programNehalemEPUncore(int core);
432  void enableJKTWorkaround(bool enable);
433  template <class CounterStateType>
434  void readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & counterState);
435  template <class CounterStateType>
436  void readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & counterState);
437  template <class CounterStateType>
438  void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState);
439  template <class CounterStateType>
440  void readAndAggregatePackageCStateResidencies(SafeMsrHandle * msr, CounterStateType & result);
441  void readQPICounters(SystemCounterState & counterState);
442  void reportQPISpeed() const;
443 
444  uint32 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const;
445  uint32 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const;
446  uint32 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const;
447  uint32 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const;
448  uint32 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
449  uint32 getMaxNumOfCBoxes() const;
450  void programCboOpcodeFilter(const uint32 opc, const uint32 cbo, SafeMsrHandle * msr);
451 
452 public:
459 
465  unsigned getMaxRMID() const;
466 
478  static PCM * getInstance(); // the only way to get access
479 
487  bool good(); // true if access to CPU counters works
488 
493  const std::string & getErrorMessage() const
494  {
495  return errorMessage;
496  }
497 
509  ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL); // program counters and start counting
510 
524  ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL);
525 
528 
531 
537 
543  void cleanup();
544 
549  void resetPMU();
550 
558  void getAllCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates);
559 
564  bool isCoreOnline(int32 os_core_id) const;
565 
574 
580 
590 
594  uint32 getNumCores();
595 
599  uint32 getNumOnlineCores();
600 
604  uint32 getNumSockets();
605 
611  uint32 getThreadsPerCore();
612 
616  bool getSMT(); // returns true iff SMT ("Hyperthreading") is on
617 
621  uint64 getNominalFrequency(); // in Hz
622 
627  uint32 getL3ScalingFactor();
628 
631  {
632  NEHALEM_EP = 26,
633  NEHALEM = 30,
634  ATOM = 28,
635  ATOM_2 = 53,
636  ATOM_CENTERTON = 54,
637  ATOM_BAYTRAIL = 55,
638  ATOM_AVOTON = 77,
639  CLARKDALE = 37,
640  WESTMERE_EP = 44,
641  NEHALEM_EX = 46,
642  WESTMERE_EX = 47,
643  SANDY_BRIDGE = 42,
644  JAKETOWN = 45,
645  IVY_BRIDGE = 58,
646  HASWELL = 60,
647  HASWELL_ULT = 69,
648  HASWELL_2 = 70,
649  IVYTOWN = 62,
650  HASWELLX = 63,
651  BROADWELL = 61,
652  END_OF_MODEL_LIST = 0x0ffff
653  };
654 
657  uint32 getCPUModel() { return cpu_model; }
658 
661  uint32 getOriginalCPUModel() { return original_cpu_model; }
662 
666  int32 getSocketId(uint32 core_id)
667  {
668  return topology[core_id].socket;
669  }
670 
673  uint64 getQPILinksPerSocket() const
674  {
675  switch (cpu_model)
676  {
677  case NEHALEM_EP:
678  case WESTMERE_EP:
679  case CLARKDALE:
680  if (num_sockets == 2)
681  return 2;
682  else
683  return 1;
684  case NEHALEM_EX:
685  case WESTMERE_EX:
686  return 4;
687  case JAKETOWN:
688  case IVYTOWN:
689  case HASWELLX:
690  return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumQPIPorts()):0;
691  }
692  return 0;
693  }
694 
696  uint32 getMCPerSocket() const
697  {
698  switch (cpu_model)
699  {
700  case NEHALEM_EP:
701  case WESTMERE_EP:
702  case CLARKDALE:
703  return 1;
704  case NEHALEM_EX:
705  case WESTMERE_EX:
706  return 2;
707  case JAKETOWN:
708  case IVYTOWN:
709  case HASWELLX:
710  return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumMC()):0;
711  }
712  return 0;
713  }
714 
716  uint32 getMCChannelsPerSocket() const
717  {
718  switch (cpu_model)
719  {
720  case NEHALEM_EP:
721  case WESTMERE_EP:
722  case CLARKDALE:
723  return 3;
724  case NEHALEM_EX:
725  case WESTMERE_EX:
726  return 4;
727  case JAKETOWN:
728  case IVYTOWN:
729  case HASWELLX:
730  return (server_pcicfg_uncore && server_pcicfg_uncore[0])?(server_pcicfg_uncore[0]->getNumMCChannels()):0;
731  }
732  return 0;
733  }
734 
737  uint32 getMaxIPC() const
738  {
739  switch (cpu_model)
740  {
741  case NEHALEM_EP:
742  case WESTMERE_EP:
743  case NEHALEM_EX:
744  case WESTMERE_EX:
745  case CLARKDALE:
746  case SANDY_BRIDGE:
747  case JAKETOWN:
748  case IVYTOWN:
749  case IVY_BRIDGE:
750  case HASWELL:
751  case HASWELLX:
752  case BROADWELL:
753  return 4;
754  case ATOM:
755  return 2;
756  }
757  return 0;
758  }
759 
761  uint64 getPCUFrequency() const
762  {
763  switch (cpu_model)
764  {
765  case JAKETOWN:
766  case IVYTOWN:
767  return 800000000ULL; // 800 MHz
768  case HASWELLX:
769  return 1000000000ULL; // 1 GHz
770  }
771  return 0;
772  }
773 
778  uint64 getTickCount(uint64 multiplier = 1000 /* ms */, uint32 core = 0);
779 
784  uint64 getTickCountRDTSCP(uint64 multiplier = 1000 /* ms */);
785 
786 
790  uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
791  { return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed; }
792 
794  double getJoulesPerEnergyUnit() const { return joulesPerEnergyUnit; }
795 
797  int32 getPackageThermalSpecPower() const { return pkgThermalSpecPower; }
798 
800  int32 getPackageMinimumPower() const { return pkgMinimumPower; }
801 
803  int32 getPackageMaximumPower() const { return pkgMaximumPower; }
804 
807  static bool initWinRing0Lib();
808 
809  inline void disableJKTWorkaround() { disable_JKT_workaround = true; }
810 
// Event codes accepted by programPCIeCounters() / programPCIeMissCounters().
enum PCIeEventCode
{
    // --- PCIe read events: PCI devices reading from memory
    //     (application writes to disk/network/PCIe device)
    PCIeRdCur = 0x19E,  // PCIe read current (full cache line)
    PCIeNSRd  = 0x1E4,  // PCIe non-snoop read (full cache line)

    // --- PCIe write events: PCI devices writing to memory
    //     (application reads from disk/network/PCIe device)
    PCIeWiLF  = 0x194,  // PCIe write, non-allocating (full cache line)
    PCIeItoM  = 0x19C,  // PCIe write, allocating (full cache line)
    PCIeNSWr  = 0x1E5,  // PCIe non-snoop write (partial cache line)
    PCIeNSWrF = 0x1E6,  // PCIe non-snoop write (full cache line)

    // --- events shared by CPU and IO
    RFO  = 0x180,       // Demand Data RFO; same code as for CPU, use tid to filter PCIe-only traffic
    CRd  = 0x181,       // Demand Code Read
    DRd  = 0x182,       // Demand Data Read
    PRd  = 0x187,       // Partial Reads (UC) (MMIO read)
    WiL  = 0x18F,       // Write Invalidate Line - partial (MMIO write), PL: not documented in HSX/IVT
    ItoM = 0x1C8        // Request Invalidate Line; same code as for CPU, use tid to filter PCIe-only traffic
};
829 
// Tid values for the RFO and ItoM events; both enumerators have the same value.
// NOTE(review): presumably these are meant for the tid_ argument of
// programPCIeCounters() - confirm against the callers.
enum CBoEventTid
{
    RFOtid  = 0x3E,
    ItoMtid = 0x3E
};
835 
839  void programPCIeCounters(const PCIeEventCode event_, const uint32 tid_ = 0, const uint32 miss_ = 0);
840  void programPCIeMissCounters(const PCIeEventCode event_, const uint32 tid_ = 0);
841 
845  PCIeCounterState getPCIeCounterState(const uint32 socket_);
846 
847  uint64 extractCoreGenCounterValue(uint64 val);
848  uint64 extractCoreFixedCounterValue(uint64 val);
849  uint64 extractUncoreGenCounterValue(uint64 val);
850  uint64 extractUncoreFixedCounterValue(uint64 val);
851  uint64 extractL3CacheOccupancy(uint64 val);
852 
855  const char * getUArchCodename(int32 cpu_model_ = -1) const;
856 
858  static std::string getCPUBrandString();
859 
860  bool packageEnergyMetricsAvailable() const
861  {
862  return (
863  cpu_model == PCM::JAKETOWN
864  || cpu_model == PCM::IVYTOWN
865  || cpu_model == PCM::SANDY_BRIDGE
866  || cpu_model == PCM::IVY_BRIDGE
867  || cpu_model == PCM::HASWELL
868  || original_cpu_model == PCM::ATOM_AVOTON
869  || cpu_model == PCM::HASWELLX
870  || cpu_model == PCM::BROADWELL
871  );
872  }
873 
874  bool dramEnergyMetricsAvailable() const
875  {
876  return (
877  cpu_model == PCM::JAKETOWN
878  || cpu_model == PCM::IVYTOWN
879  || cpu_model == PCM::HASWELLX
880  );
881  }
882 
883  bool packageThermalMetricsAvailable() const
884  {
885  return packageEnergyMetricsAvailable();
886  }
887 
888  bool outgoingQPITrafficMetricsAvailable() const
889  {
890  return (
891  cpu_model == PCM::NEHALEM_EX
892  || cpu_model == PCM::WESTMERE_EX
893  || cpu_model == PCM::JAKETOWN
894  || cpu_model == PCM::IVYTOWN
895  || cpu_model == PCM::HASWELLX
896  );
897  }
898 
899  bool qpiUtilizationMetricsAvailable() const
900  {
901  return outgoingQPITrafficMetricsAvailable();
902  }
903 
904  bool memoryTrafficMetricsAvailable() const
905  {
906  return !(
907  cpu_model == PCM::ATOM
908  || cpu_model == PCM::CLARKDALE
909  );
910  }
911 
912  bool memoryIOTrafficMetricAvailable() const
913  {
914  return (
915  cpu_model == PCM::SANDY_BRIDGE
916  || cpu_model == PCM::IVY_BRIDGE
917  || cpu_model == PCM::HASWELL
918  || cpu_model == PCM::BROADWELL
919  );
920  }
921 
922  bool hasBecktonUncore() const
923  {
924  return (
925  cpu_model == PCM::NEHALEM_EX
926  || cpu_model == PCM::WESTMERE_EX
927  );
928  }
929  bool hasPCICFGUncore() const // has PCICFG uncore PMON
930  {
931  return (
932  cpu_model == PCM::JAKETOWN
933  || cpu_model == PCM::IVYTOWN
934  || cpu_model == PCM::HASWELLX
935  );
936  }
937 
938  ~PCM();
939 };
940 
945 {
946  friend class PCM;
947  template <class CounterStateType>
948  friend double getExecUsage(const CounterStateType & before, const CounterStateType & after);
949  template <class CounterStateType>
950  friend double getIPC(const CounterStateType & before, const CounterStateType & after);
951  template <class CounterStateType>
952  friend double getAverageFrequency(const CounterStateType & before, const CounterStateType & after);
953  template <class CounterStateType>
954  friend double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after);
955  template <class CounterStateType>
956  friend double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
957  template <class CounterStateType>
958  friend double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
959  template <class CounterStateType>
960  friend double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
961  template <class CounterStateType>
962  friend double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
963  template <class CounterStateType>
964  friend double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
965  template <class CounterStateType>
966  friend double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
967  template <class CounterStateType>
968  friend uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
969  template <class CounterStateType>
970  friend uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
971  template <class CounterStateType>
972  friend uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after);
973  template <class CounterStateType>
974  friend uint64 getL3CacheOccupancy(const CounterStateType & now);
975  template <class CounterStateType>
976  friend uint64 getCycles(const CounterStateType & before, const CounterStateType & after);
977  template <class CounterStateType>
978  friend uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after);
979  template <class CounterStateType>
980  friend uint64 getCycles(const CounterStateType & now);
981  template <class CounterStateType>
982  friend uint64 getInstructionsRetired(const CounterStateType & now);
983  template <class CounterStateType>
984  friend uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after);
985  template <class CounterStateType>
986  friend uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after);
987  template <class CounterStateType>
988  friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
989  template <class CounterStateType>
990  friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after);
991  template <class CounterStateType>
992  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
993  template <class CounterStateType>
994  friend uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after);
995  template <class CounterStateType>
996  friend double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
997 protected:
998  uint64 InstRetiredAny;
999  uint64 CpuClkUnhaltedThread;
1000  uint64 CpuClkUnhaltedRef;
1001  // dont put any additional fields between Event 0-Event 3 because getNumberOfCustomEvents assumes there are none
1002  union {
1003  uint64 L3Miss;
1004  uint64 Event0;
1005  uint64 ArchLLCMiss;
1006  };
1007  union {
1008  uint64 L3UnsharedHit;
1009  uint64 Event1;
1010  uint64 ArchLLCRef;
1011  };
1012  union {
1013  uint64 L2HitM;
1014  uint64 Event2;
1015  };
1016  union {
1017  uint64 L2Hit;
1018  uint64 Event3;
1019  };
1020  uint64 InvariantTSC; // invariant time stamp counter
1021  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
1022  int32 ThermalHeadroom;
1023  uint64 L3Occupancy;
1024  void readAndAggregate(SafeMsrHandle *);
1025 public:
1026  BasicCounterState() :
1027  InstRetiredAny(0)
1028  , CpuClkUnhaltedThread(0)
1029  , CpuClkUnhaltedRef(0)
1030  , L3Miss(0)
1031  , L3UnsharedHit(0)
1032  , L2HitM(0)
1033  , L2Hit(0)
1034  , InvariantTSC(0)
1035  , ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM)
1036  , L3Occupancy(0)
1037  {
1038  memset(&(CStateResidency[0]), 0, sizeof(CStateResidency));
1039  }
1040  virtual ~BasicCounterState() { }
1041 
1042  BasicCounterState & operator += (const BasicCounterState & o)
1043  {
1044  InstRetiredAny += o.InstRetiredAny;
1045  CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
1046  CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
1047  Event0 += o.Event0;
1048  Event1 += o.Event1;
1049  Event2 += o.Event2;
1050  Event3 += o.Event3;
1051  InvariantTSC += o.InvariantTSC;
1052  for(int i=0; i <= PCM::MAX_C_STATE ;++i)
1053  CStateResidency[i] += o.CStateResidency[i];
1054  // ThermalHeadroom is not accumulative
1055  L3Occupancy += o.L3Occupancy;
1056  return *this;
1057  }
1058 
1060  int32 getThermalHeadroom() const { return ThermalHeadroom; }
1061 };
1062 
1068 template <class CounterStateType>
1069 uint64 getQPIClocks(uint32 port, const CounterStateType & before, const CounterStateType & after)
1070 {
1071  return after.QPIClocks[port] - before.QPIClocks[port];
1072 }
1073 
1074 
1075 template <class CounterStateType>
1076 int32 getThermalHeadroom(const CounterStateType & /* before */, const CounterStateType & after)
1077 {
1078  return after.getThermalHeadroom();
1079 }
1080 
1086 template <class CounterStateType>
1087 uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1088 {
1089  return after.QPIL0pTxCycles[port] - before.QPIL0pTxCycles[port];
1090 }
1091 
1097 template <class CounterStateType>
1098 uint64 getQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1099 {
1100  return after.QPIL1Cycles[port] - before.QPIL1Cycles[port];
1101 }
1102 
1109 template <class CounterStateType>
1110 double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1111 {
1112  return double(getQPIL0pTxCycles(port,before,after))/double(getQPIClocks(port,before,after));
1113 }
1114 
1121 template <class CounterStateType>
1122 double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1123 {
1124  return double(getQPIL1Cycles(port,before,after))/double(getQPIClocks(port,before,after));
1125 }
1126 
1132 template <class CounterStateType>
1133 uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
1134 {
1135  return after.DRAMClocks[channel] - before.DRAMClocks[channel];
1136 }
1137 
1144 template <class CounterStateType>
1145 uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
1146 {
1147  return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
1148 }
1149 
1155 template <class CounterStateType>
1156 uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after)
1157 {
1158  return after.PCUCounter[counter] - before.PCUCounter[counter];
1159 }
1160 
1165 template <class CounterStateType>
1166 uint64 getPCUClocks(const CounterStateType & before, const CounterStateType & after)
1167 {
1168  return getPCUCounter(0,before,after);
1169 }
1170 
1175 template <class CounterStateType>
1176 uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
1177 {
1178  return after.PackageEnergyStatus - before.PackageEnergyStatus;
1179 }
1180 
1185 template <class CounterStateType>
1186 uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
1187 {
1188  return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
1189 }
1190 
1195 template <class CounterStateType>
1196 double getConsumedJoules(const CounterStateType & before, const CounterStateType & after)
1197 {
1198  PCM * m = PCM::getInstance();
1199  if(!m) return -1.;
1200 
1201  return double(getConsumedEnergy(before,after))*m->getJoulesPerEnergyUnit();
1202 }
1203 
1208 template <class CounterStateType>
1209 double getDRAMConsumedJoules(const CounterStateType & before, const CounterStateType & after)
1210 {
1211  PCM * m = PCM::getInstance();
1212  if(!m) return -1.;
1213  double dram_joules_per_energy_unit;
1214 
1215  if(PCM::HASWELLX == m->getCPUModel()) {
1216 /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of
1217  * Volume 2 (Registers) of
1218  * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014)
1219  * ENERGY_UNIT for DRAM domain is fixed to 15.3 uJ for server Haswell processors.
1220  */
1221  dram_joules_per_energy_unit=0.0000153;
1222  } else {
1223 /* for all other processors (including Haswell client/mobile SKUs) the ENERGY_UNIT for DRAM domain
1224  * should be read from PACKAGE_POWER_SKU register (usually value around ~61uJ)
1225  */
1226  dram_joules_per_energy_unit=m->getJoulesPerEnergyUnit();
1227  }
1228  return double(getDRAMConsumedEnergy(before,after))*dram_joules_per_energy_unit;
1229 }
1230 
1231 
1236 {
1237  friend class PCM;
1238  template <class CounterStateType>
1239  friend uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after);
1240  template <class CounterStateType>
1241  friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after);
1242  template <class CounterStateType>
1243  friend uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
1244  template <class CounterStateType>
1245  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
1246  template <class CounterStateType>
1247  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
1248  template <class CounterStateType>
1249  friend double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
1250 protected:
1251  uint64 UncMCFullWrites;
1252  uint64 UncMCNormalReads;
1253  uint64 UncMCIORequests;
1254  uint64 PackageEnergyStatus;
1255  uint64 DRAMEnergyStatus;
1256  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
1257  void readAndAggregate(SafeMsrHandle *);
1258 public:
1259  UncoreCounterState() :
1260  UncMCFullWrites(0)
1261  , UncMCNormalReads(0)
1262  , UncMCIORequests(0)
1263  , PackageEnergyStatus(0)
1264  , DRAMEnergyStatus(0)
1265  {
1266  memset(&(CStateResidency[0]), 0, sizeof(CStateResidency));
1267  }
1268  virtual ~UncoreCounterState() { }
1269 
1270  UncoreCounterState & operator += (const UncoreCounterState & o)
1271  {
1272  UncMCFullWrites += o.UncMCFullWrites;
1273  UncMCNormalReads += o.UncMCNormalReads;
1274  UncMCIORequests += o.UncMCIORequests;
1275  PackageEnergyStatus += o.PackageEnergyStatus;
1276  DRAMEnergyStatus += o.DRAMEnergyStatus;
1277  for(int i=0; i <= PCM::MAX_C_STATE ;++i)
1278  CStateResidency[i] += o.CStateResidency[i];
1279  return *this;
1280  }
1281 };
1282 
1283 
1287 {
1288  uint64 QPIClocks[3], QPIL0pTxCycles[3], QPIL1Cycles[3];
1289  uint64 DRAMClocks[8];
1290  uint64 MCCounter[8][4];// channel X counter
1291  uint64 PCUCounter[4];
1292  int32 PackageThermalHeadroom;
1293  uint64 InvariantTSC; // invariant time stamp counter
1294  friend class PCM;
1295  template <class CounterStateType>
1296  friend uint64 getQPIClocks(uint32 port, const CounterStateType & before, const CounterStateType & after);
1297  template <class CounterStateType>
1298  friend uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after);
1299  template <class CounterStateType>
1300  friend uint64 getQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after);
1301  template <class CounterStateType>
1302  friend uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
1303  template <class CounterStateType>
1304  friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
1305  template <class CounterStateType>
1306  friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after);
1307  template <class CounterStateType>
1308  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
1309  template <class CounterStateType>
1310  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
1311  template <class CounterStateType>
1312  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
1313 public:
1315  int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; }
1317  PackageThermalHeadroom(0)
1318  , InvariantTSC(0)
1319  {
1320  memset(&(QPIClocks[0]), 0, 3*sizeof(uint64));
1321  memset(&(QPIL0pTxCycles[0]), 0, 3*sizeof(uint64));
1322  memset(&(QPIL1Cycles[0]), 0, 3*sizeof(uint64));
1323  memset(&(DRAMClocks[0]), 0, 8*sizeof(uint64));
1324  memset(&(PCUCounter[0]), 0, 4*sizeof(uint64));
1325  for(int i=0;i<8;++i)
1326  memset(&(MCCounter[i][0]), 0, 4*sizeof(uint64));
1327  }
1328 };
1329 
1332 {
1333  friend class PCM;
1334 
1335 public:
1336 };
1337 
1340 {
1341  friend class PCM;
1342 
1343 protected:
1344  void readAndAggregate(SafeMsrHandle * handle)
1345  {
1346  BasicCounterState::readAndAggregate(handle);
1347  UncoreCounterState::readAndAggregate(handle);
1348  }
1349 
1350 public:
1351  void accumulateCoreState(const CoreCounterState & o)
1352  {
1353  BasicCounterState::operator += (o);
1354  }
1355 };
1356 
1359 {
1360  friend class PCM;
1361  std::vector<std::vector<uint64> > incomingQPIPackets;
1362  std::vector<std::vector<uint64> > outgoingQPIIdleFlits;
1363  std::vector<std::vector<uint64> > outgoingQPIDataNonDataFlits;
1364  uint64 uncoreTSC;
1365 
1366 protected:
1367  void readAndAggregate(SafeMsrHandle * handle)
1368  {
1369  BasicCounterState::readAndAggregate(handle);
1370  UncoreCounterState::readAndAggregate(handle);
1371  }
1372 
1373 public:
1374  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
1375  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
1376  friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
1377  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
1378  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
1379  SystemCounterState() :
1380  uncoreTSC(0)
1381  {
1382  PCM * m = PCM::getInstance();
1383  incomingQPIPackets.resize(m->getNumSockets(),
1384  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
1385  outgoingQPIIdleFlits.resize(m->getNumSockets(),
1386  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
1387  outgoingQPIDataNonDataFlits.resize(m->getNumSockets(),
1388  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
1389  }
1390 
1391  void accumulateSocketState(const SocketCounterState & o)
1392  {
1393  if (&o != NULL) // security check requirement
1394  {
1395  BasicCounterState::operator += (o);
1396  UncoreCounterState::operator += (o);
1397  }
1398  }
1399 };
1400 
1411 
1419 INTELPCM_API SocketCounterState getSocketCounterState(uint32 socket);
1420 
1428 INTELPCM_API CoreCounterState getCoreCounterState(uint32 core);
1429 
1430 
1437 template <class CounterStateType>
1438 double getIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
1439 {
1440  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1441  if (clocks != 0)
1442  return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
1443  return -1;
1444 }
1445 
1446 
1453 template <class CounterStateType>
1454 uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after) // instructions
1455 {
1456  return after.InstRetiredAny - before.InstRetiredAny;
1457 }
1458 
1465 template <class CounterStateType>
1466 double getExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
1467 {
1468  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
1469  if (timer_clocks != 0)
1470  return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
1471  return -1;
1472 }
1473 
1479 template <class CounterStateType>
1480 uint64 getInstructionsRetired(const CounterStateType & now) // instructions
1481 {
1482  return now.InstRetiredAny;
1483 }
1484 
1502 template <class CounterStateType>
1503 uint64 getCycles(const CounterStateType & before, const CounterStateType & after) // clocks
1504 {
1505  return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1506 }
1507 
1518 template <class CounterStateType>
1519 uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after) // clocks
1520 {
1521  return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
1522 }
1523 
1531 template <class CounterStateType>
1532 uint64 getCycles(const CounterStateType & now) // clocks
1533 {
1534  return now.CpuClkUnhaltedThread;
1535 }
1536 
1545 inline double getCoreIPC(const SystemCounterState & before, const SystemCounterState & after) // instructions per cycle
1546 {
1547  double ipc = getIPC(before, after);
1548  PCM * m = PCM::getInstance();
1549  if (ipc >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
1550  return ipc * double(m->getThreadsPerCore());
1551  return -1;
1552 }
1553 
1554 
1555 
1556 
1565 inline double getTotalExecUsage(const SystemCounterState & before, const SystemCounterState & after) // usage
1566 {
1567  double usage = getExecUsage(before, after);
1568  PCM * m = PCM::getInstance();
1569  if (usage >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
1570  return usage * double(m->getThreadsPerCore());
1571  return -1;
1572 }
1573 
1580 template <class CounterStateType>
1581 double getAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
1582 {
1583  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1584  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
1585  PCM * m = PCM::getInstance();
1586  if (timer_clocks != 0 && m)
1587  return double(m->getNominalFrequency()) * double(clocks) / double(timer_clocks);
1588  return -1;
1589 }
1590 
1597 template <class CounterStateType>
1598 double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
1599 {
1600  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1601  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
1602  PCM * m = PCM::getInstance();
1603  if (ref_clocks != 0 && m)
1604  return double(m->getNominalFrequency()) * double(clocks) / double(ref_clocks);
1605  return -1;
1606 }
1607 
1614 template <class CounterStateType>
1615 double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
1616 {
1617  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1618  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
1619  if (timer_clocks != 0)
1620  return double(clocks) / double(timer_clocks);
1621  return -1;
1622 }
1623 
1630 template <class CounterStateType>
1631 double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
1632 {
1633  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1634  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
1635  if (ref_clocks != 0)
1636  return double(clocks) / double(ref_clocks);
1637  return -1;
1638 }
1639 
1647 template <class CounterStateType>
1648 double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
1649 {
1650  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return -1;
1651  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1652  if (clocks != 0)
1653  {
1654  return 180. * double(after.L3Miss - before.L3Miss) / double(clocks);
1655  }
1656  return -1;
1657 }
1658 
1667 template <class CounterStateType>
1668 double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
1669 {
1670  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return -1;
1671  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
1672  if (clocks != 0)
1673  {
1674  double L3UnsharedHit = (double)(after.L3UnsharedHit - before.L3UnsharedHit);
1675  double L2HitM = (double)(after.L2HitM - before.L2HitM);
1676  return (35. * L3UnsharedHit + 74. * L2HitM) / double(clocks);
1677  }
1678  return -1;
1679 }
1680 
1688 template <class CounterStateType>
1689 double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
1690 {
1691  if (PCM::getInstance()->getCPUModel() == PCM::ATOM)
1692  {
1693  uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
1694  uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
1695  if (L2Ref) return 1. - (double(L2Miss) / double(L2Ref));
1696  return 1;
1697  }
1698  uint64 L3Miss = after.L3Miss - before.L3Miss;
1699  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
1700  uint64 L2HitM = after.L2HitM - before.L2HitM;
1701  uint64 L2Hit = after.L2Hit - before.L2Hit;
1702  uint64 hits = L2Hit;
1703  uint64 all = L2Hit + L2HitM + L3UnsharedHit + L3Miss;
1704  if (all) return double(hits) / double(all);
1705 
1706  return 1;
1707 }
1708 
1716 template <class CounterStateType>
1717 double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
1718 {
1719  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return -1;
1720 
1721  uint64 L3Miss = after.L3Miss - before.L3Miss;
1722  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
1723  uint64 L2HitM = after.L2HitM - before.L2HitM;
1724  uint64 hits = L3UnsharedHit + L2HitM;
1725  uint64 all = L2HitM + L3UnsharedHit + L3Miss;
1726  if (all) return double(hits) / double(all);
1727 
1728  return 1;
1729 }
1730 
1738 template <class CounterStateType>
1739 uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after)
1740 {
1741  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0;
1742  return after.L3Miss - before.L3Miss;
1743 }
1744 
1752 template <class CounterStateType>
1753 uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after)
1754 {
1755  if (PCM::getInstance()->getCPUModel() == PCM::ATOM)
1756  {
1757  return after.ArchLLCMiss - before.ArchLLCMiss;
1758  }
1759  uint64 L3Miss = after.L3Miss - before.L3Miss;
1760  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
1761  uint64 L2HitM = after.L2HitM - before.L2HitM;
1762  return L2HitM + L3UnsharedHit + L3Miss;
1763 }
1764 
1772 template <class CounterStateType>
1773 uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after)
1774 {
1775  if (PCM::getInstance()->getCPUModel() == PCM::ATOM)
1776  {
1777  uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
1778  uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
1779  return L2Ref - L2Miss;
1780  }
1781  return after.L2Hit - before.L2Hit;
1782 }
1783 
1787 template <class CounterStateType>
1788 uint64 getL3CacheOccupancy(const CounterStateType & now)
1789 {
1790 
1791  return now.L3Occupancy ;
1792 
1793 }
1794 
1802 template <class CounterStateType>
1803 uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after)
1804 {
1805  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0;
1806  return after.L3UnsharedHit - before.L3UnsharedHit;
1807 }
1808 
1816 template <class CounterStateType>
1817 uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after)
1818 {
1819  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0;
1820  return after.L2HitM - before.L2HitM;
1821 }
1822 
1823 
1831 template <class CounterStateType>
1832 uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after)
1833 {
1834  if (PCM::getInstance()->getCPUModel() == PCM::ATOM) return 0;
1835  return getL3CacheHitsSnoop(before, after) + getL3CacheHitsNoSnoop(before, after);
1836 }
1837 
1846 template <class CounterStateType>
1847 uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after)
1848 {
1849  return after.InvariantTSC - before.InvariantTSC;
1850 }
1851 
1859 template <class CounterStateType>
1860 inline double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
1861 {
1862  const double tsc = double(getInvariantTSC(before,after));
1863 
1864  if(state == 0) return double(getRefCycles(before,after))/tsc;
1865 
1866  if(state == 1)
1867  {
1868  PCM * m = PCM::getInstance();
1869  double result = 1.0 - double(getRefCycles(before,after))/tsc; // 1.0 - cC0
1870  for(int i = 2; i <= PCM::MAX_C_STATE; ++i)
1871  if(m->isCoreCStateResidencySupported(state))
1872  result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i])/tsc;
1873 
1874  if(result < 0.) result = 0.; // fix counter dissynchronization
1875  else if(result > 1.) result = 1.; // fix counter dissynchronization
1876 
1877  return result;
1878  }
1879  return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state])/tsc;
1880 }
1881 
1889 template <class CounterStateType>
1890 inline double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
1891 {
1892  return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state])/double(getInvariantTSC(before,after));
1893 }
1894 
1895 
1902 template <class CounterStateType>
1903 uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after)
1904 {
1905  return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
1906 }
1907 
1914 template <class CounterStateType>
1915 uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after)
1916 {
1917  return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
1918 }
1919 
1926 template <class CounterStateType>
1927 uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
1928 {
1929  return (after.UncMCIORequests - before.UncMCIORequests) * 64;
1930 }
1931 
1941 template <class CounterStateType>
1942 uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after)
1943 {
1944  return ((&after.Event0)[eventCounterNr] - (&before.Event0)[eventCounterNr]);
1945 }
1946 
1957 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
1958 {
1959  uint64 b = before.incomingQPIPackets[socketNr][linkNr];
1960  uint64 a = after.incomingQPIPackets[socketNr][linkNr];
1961  // prevent overflows due to counter dissynchronisation
1962  return (a > b) ? (64 * (a - b)) : 0;
1963 }
1964 
1975 inline double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
1976 {
1977  PCM * m = PCM::getInstance();
1978  if (!(m->qpiUtilizationMetricsAvailable())) return 0.;
1979 
1980  const double bytes = (double)getIncomingQPILinkBytes(socketNr, linkNr, before, after);
1981  const uint64 max_speed = m->getQPILinkSpeed(socketNr, linkNr);
1982  const double max_bytes = (double)(double(max_speed) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
1983  return bytes / max_bytes;
1984 }
1985 
1996 inline double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
1997 {
1998  PCM * m = PCM::getInstance();
1999 
2000  if(m->hasBecktonUncore())
2001  {
2002  const uint64 b = before.outgoingQPIIdleFlits[socketNr][linkNr];
2003  const uint64 a = after.outgoingQPIIdleFlits[socketNr][linkNr];
2004  // prevent overflows due to counter dissynchronisation
2005  const double idle_flits = (double)((a > b) ? (a - b) : 0);
2006  const uint64 bTSC = before.uncoreTSC;
2007  const uint64 aTSC = after.uncoreTSC;
2008  const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
2009  if(idle_flits > tsc) return 0.; // prevent oveflows due to potential counter dissynchronization
2010 
2011  return (1. - (idle_flits / tsc));
2012  } else if(m->hasPCICFGUncore())
2013  {
2014  const uint64 b = before.outgoingQPIDataNonDataFlits[socketNr][linkNr];
2015  const uint64 a = after.outgoingQPIDataNonDataFlits[socketNr][linkNr];
2016  // prevent overflows due to counter dissynchronisation
2017  const double flits = (double)((a > b) ? (a - b) : 0);
2018  const double max_flits = ((double(getInvariantTSC(before, after))*double(m->getQPILinkSpeed(socketNr, linkNr))/(2.0*4.0))/double(m->getNominalFrequency()))/double(m->getNumCores());
2019  if(flits > max_flits) return 1.; // prevent oveflows due to potential counter dissynchronization
2020  return (flits / max_flits);
2021  }
2022 
2023  return 0;
2024 }
2025 
2036 inline uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
2037 {
2038  PCM * m = PCM::getInstance();
2039  if (!(m->outgoingQPITrafficMetricsAvailable())) return 0;
2040 
2041  const double util = getOutgoingQPILinkUtilization(socketNr, linkNr, before, after);
2042  const double max_bytes = (double(m->getQPILinkSpeed(socketNr, linkNr)) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
2043 
2044  return (uint64)(max_bytes * util);
2045 }
2046 
2047 
2056 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
2057 {
2058  PCM * m = PCM::getInstance();
2059  const uint32 ns = m->getNumSockets();
2060  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
2061  uint64 sum = 0;
2062 
2063  for (uint32 s = 0; s < ns; ++s)
2064  for (uint32 q = 0; q < qpiLinks; ++q)
2065  sum += getIncomingQPILinkBytes(s, q, before, after);
2066 
2067  return sum;
2068 }
2069 
2078 inline uint64 getAllOutgoingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
2079 {
2080  PCM * m = PCM::getInstance();
2081  const uint32 ns = m->getNumSockets();
2082  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
2083  uint64 sum = 0;
2084 
2085  for (uint32 s = 0; s < ns; ++s)
2086  for (uint32 q = 0; q < qpiLinks; ++q)
2087  sum += getOutgoingQPILinkBytes(s, q, before, after);
2088 
2089  return sum;
2090 }
2091 
2092 
2102 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now)
2103 {
2104  return 64 * now.incomingQPIPackets[socketNr][linkNr];
2105 }
2106 
2115 inline uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState & now)
2116 {
2117  PCM * m = PCM::getInstance();
2118  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
2119  uint64 sum = 0;
2120 
2121  for (uint32 q = 0; q < qpiLinks; ++q)
2122  sum += getIncomingQPILinkBytes(socketNr, q, now);
2123 
2124  return sum;
2125 }
2126 
2135 {
2136  PCM * m = PCM::getInstance();
2137  const uint32 ns = m->getNumSockets();
2138  uint64 sum = 0;
2139 
2140  for (uint32 s = 0; s < ns; ++s)
2141  sum += getSocketIncomingQPILinkBytes(s, now);
2142  return sum;
2143 }
2144 
2145 
2155 inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after)
2156 {
2157  const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after);
2158  const uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after);
2159  return double(totalQPI) / double(memTraffic);
2160 }
2161 
2166 {
2167  return after.data - before.data;
2168 }
2169 
2170 #endif
double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1122
uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState &now)
Get estimation of total QPI data traffic for this socket.
Definition: cpucounters.h:2115
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:1847
uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1069
friend uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:1915
Definition: types.h:283
friend uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:1903
uint64 getAllOutgoingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data+nondata traffic.
Definition: cpucounters.h:2078
void cleanup()
Cleans up resources and stops performance counting.
Definition: cpucounters.cpp:2474
Internal type and constant definitions.
friend uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:1156
uint32 getNumQPIPorts() const
Returns the number of detected QPI ports.
Definition: cpucounters.h:162
uint64 getQPILinkSpeed(const uint32 linkNr) const
Returns the speed of the QPI link.
Definition: cpucounters.h:165
uint64 getNumberOfEvents(PCIeCounterState before, PCIeCounterState after)
Returns the raw count of PCIe events.
Definition: cpucounters.h:2165
bool good()
Checks the status of PCM object.
Definition: cpucounters.cpp:1491
Definition: pci.h:83
void setRunState(int new_state)
Set Run State.
Definition: cpucounters.h:297
Socket-wide counter state.
Definition: cpucounters.h:1339
friend uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:1803
uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:1739
void freezeServerUncoreCounters()
Freezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown) ...
Definition: cpucounters.cpp:2974
Interface to access client bandwidth counters.
uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:1957
friend uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:1817
Definition: types.h:316
void resetPMU()
Forces PMU reset.
Definition: cpucounters.cpp:2395
INTELPCM_API SocketCounterState getSocketCounterState(uint32 socket)
Reads the counter state of a socket.
Definition: cpucounters.cpp:2665
SupportedCPUModels
Identifiers of supported CPU models.
Definition: cpucounters.h:630
double getCoreIPC(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per core cycle for the entire system combining instru...
Definition: cpucounters.h:1545
uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:1832
Definition: cpucounters.h:319
uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:1817
Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP...
Definition: cpucounters.h:80
uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:1186
friend double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:1438
void reportQPISpeed() const
Print QPI Speeds.
Definition: cpucounters.h:170
Definition: cpucounters.h:70
unsigned getMaxRMID() const
Returns the maximum number of RMIDs supported by the socket
Definition: cpucounters.cpp:423
uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:1788
Definition: cpucounters.h:318
friend double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:1631
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:1176
friend double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:1668
friend double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:1689
uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
Return QPI Link Speed in GBytes/second.
Definition: cpucounters.h:790
ProgramMode
Mode of programming (parameter in the program() method)
Definition: cpucounters.h:315
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:1847
double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:1689
double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:1466
uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1098
uint32 getMCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:716
int32 getThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:1060
double getConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by processor (excluding DRAM)
Definition: cpucounters.h:1196
double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:1648
uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel)
Measures/computes the maximum theoretical QPI link bandwidth speed in GByte/seconds.
Definition: cpucounters.cpp:4183
uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:1176
double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:1631
System-wide counter state.
Definition: cpucounters.h:1358
friend double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:1717
PCIeCounterState getPCIeCounterState(const uint32 socket_)
Get the state of PCIe counter(s)
Definition: cpucounters.cpp:4392
uint64 getTickCount(uint64 multiplier=1000, uint32 core=0)
Return TSC timer value in time units.
Definition: cpucounters.cpp:2606
uint32 getL3ScalingFactor()
runs CPUID.0xF.0x01 to get the L3 up scaling factor to calculate L3 Occupancy Scaling factor is retur...
Definition: cpucounters.cpp:3333
friend double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:1860
double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1110
bool L3CacheOccupancyMetricAvailable()
Checks if cache monitoring is present
Definition: cpucounters.cpp:416
uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1087
friend uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1087
double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:1860
INTELPCM_API CoreCounterState getCoreCounterState(uint32 core)
Reads the counter state of a (logical) core.
Definition: cpucounters.cpp:2673
void unfreezeServerUncoreCounters()
Unfreezes uncore event counting (works only on microarchitecture codename SandyBridge-EP and IvyTown)...
Definition: cpucounters.cpp:2982
uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1133
ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int *freq_bands=NULL)
Programs uncore power/energy counters on microarchitectures codename SandyBridge-EP and IvyTown...
Definition: cpucounters.cpp:2806
friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:1996
uint32 getCPUModel()
Reads CPU model id.
Definition: cpucounters.h:657
friend double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:1581
Custom Core event description.
Definition: cpucounters.h:337
double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:1890
uint64 getAllIncomingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data traffic.
Definition: cpucounters.h:2056
INTELPCM_API SystemCounterState getSystemCounterState()
Reads the counter state of the system.
Definition: cpucounters.cpp:2657
void program()
Program performance counters (disables programming power counters)
Definition: cpucounters.cpp:3760
uint64 getQPIL0pTxCycles(uint32 port)
Get number of cycles on a QPI port when the link was in a power saving half-lane mode.
Definition: cpucounters.cpp:4062
Basic uncore counter state.
Definition: cpucounters.h:1235
void enableJKTWorkaround(bool enable)
Enable correct counting of various LLC events (with memory access perf penalty)
Definition: cpucounters.cpp:4148
uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:1915
Extended custom core event description.
Definition: cpucounters.h:351
int32 getPackageThermalSpecPower() const
Returns thermal specification power of the package domain in Watt.
Definition: cpucounters.h:797
friend uint64 getNumberOfEvents(PCIeCounterState before, PCIeCounterState after)
Returns the raw count of PCIe events.
Definition: cpucounters.h:2165
double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:1598
double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:1615
uint64 getTickCountRDTSCP(uint64 multiplier=1000)
Return TSC timer value in time units using rdtscp instruction from current core.
Definition: cpucounters.cpp:2652
uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:1145
uint64 getQPILLCounter(uint32 port, uint32 counter)
Direct read of QPI LL PMU counter (counter meaning depends on the programming: power/performance/etc)...
Definition: cpucounters.cpp:4122
uint64 getQPILinksPerSocket() const
Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket.
Definition: cpucounters.h:673
friend double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:1466
Definition: cpucounters.h:185
double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:1717
void programPCIeCounters(const PCIeEventCode event_, const uint32 tid_=0, const uint32 miss_=0)
Program uncore PCIe monitoring event(s)
Definition: cpucounters.cpp:4350
double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:1581
uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:1927
uint32 getNumSockets()
Reads number of sockets (CPUs) in the system.
Definition: cpucounters.cpp:3313
uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:1803
int getRunState(void)
Returns program's Run State.
Definition: cpucounters.h:303
uint32 getNumOnlineCores()
Reads number of online logical cores in the system.
Definition: cpucounters.cpp:3308
uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:2036
bool getSMT()
Checks if SMT (HyperThreading) is enabled.
Definition: cpucounters.cpp:3323
void unfreezeCounters()
Unfreezes event counting.
Definition: cpucounters.cpp:4038
friend uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:1519
friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:1145
uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:1847
Definition: cpucounters.h:316
Low level interface to access PCI configuration space.
friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:1957
friend uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:1753
uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:1519
friend uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:1773
uint64 getPCUClocks(const CounterStateType &before, const CounterStateType &after)
Returns clock ticks of power control unit.
Definition: cpucounters.h:1166
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:1186
uint32 getOriginalCPUModel()
Reads original CPU model id.
Definition: cpucounters.h:661
Provides 64-bit "virtual" counters from underlying 32-bit HW counters.
void setOutput(const std::string filename)
Redirects output destination to provided file, instead of std::cout.
Definition: cpucounters.cpp:2456
void freezeCounters()
Freezes event counting.
Definition: cpucounters.cpp:4026
static bool initWinRing0Lib()
Loads and initializes Winring0 third party library for access to processor model specific and PCI con...
int32 getSocketId(uint32 core_id)
Determines socket of given core.
Definition: cpucounters.h:666
friend double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:1615
void allowMultipleInstances()
Call it before program() to allow multiple running instances of PCM on the same system.
Definition: cpucounters.h:309
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:1176
int32 getPackageMaximumPower() const
Returns maximum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:803
friend uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:1788
friend uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1133
void program_power_metrics(int mc_profile)
Program power counters (disables programming performance counters)
Definition: cpucounters.cpp:3909
uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:1503
void getAllCounterStates(SystemCounterState &systemState, std::vector< SocketCounterState > &socketStates, std::vector< CoreCounterState > &coreStates)
Reads all counter states (including system, sockets and cores)
Definition: cpucounters.cpp:3254
friend double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:1890
uint64 getMCCounter(uint32 channel, uint32 counter)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.cpp:4096
double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get data utilization of incoming QPI link (0..1)
Definition: cpucounters.h:1975
uint64 getQPIClocks(uint32 port)
Get number of QPI LL clocks on a QPI port.
Definition: cpucounters.cpp:4050
const char * getUArchCodename(int32 cpu_model_=-1) const
Get a string describing the codename of the processor microarchitecture.
Definition: cpucounters.cpp:2326
uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:1942
static std::string getCPUBrandString()
Get Brand string of processor.
Definition: cpucounters.cpp:2176
Definition: width_extender.h:39
friend uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1069
double getDRAMConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by DRAM.
Definition: cpucounters.h:1209
uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:1454
uint32 getNumCores()
Reads number of logical cores in the system.
Definition: cpucounters.cpp:3303
CPU Performance Monitor.
Definition: cpucounters.h:212
int32 getPackageMinimumPower() const
Returns minimum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:800
friend uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:1927
friend uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:1739
friend double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:1648
uint64 getNominalFrequency()
Reads the nominal core frequency.
Definition: cpucounters.cpp:3328
double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:1996
uint64 getImcWrites()
Get the number of integrated controller writes (in cache lines)
Definition: cpucounters.cpp:3877
uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:1156
friend uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:1454
ErrorCode program(const ProgramMode mode_=DEFAULT_EVENTS, const void *parameter_=NULL)
Programs performance counters.
Definition: cpucounters.cpp:1556
uint64 getPCUFrequency() const
Returns the frequency of Power Control Unit.
Definition: cpucounters.h:761
uint32 getThreadsPerCore()
Reads how many hardware threads a physical core has. "Hardware thread" is a logical core in a differen...
Definition: cpucounters.cpp:3318
double getJoulesPerEnergyUnit() const
Returns how many joules are in an internal processor energy unit.
Definition: cpucounters.h:794
void restoreOutput()
Restores output, closes output file if opened.
Definition: cpucounters.cpp:2463
Definition: client_bw.h:40
Definition: msr.h:62
uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:1753
const std::string & getErrorMessage() const
Returns the error message.
Definition: cpucounters.h:493
double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:1438
int32 getPackageThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:1315
bool isCoreCStateResidencySupported(int state)
Returns true if the specified core C-state residency metric is supported.
Definition: cpucounters.h:273
Server uncore power counter state.
Definition: cpucounters.h:1286
uint32 getNumMCChannels() const
Returns the total number of detected memory channels on all integrated memory controllers.
Definition: cpucounters.h:182
friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:2036
Definition: cpucounters.h:317
bool isPackageCStateResidencySupported(int state)
Returns true if the specified package C-state residency metric is supported.
Definition: cpucounters.h:282
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:1186
(Logical) core-wide counter state
Definition: cpucounters.h:1331
uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:1773
bool isCoreOnline(int32 os_core_id) const
Returns true if the core is online.
Definition: cpucounters.cpp:1404
double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:1668
uint32 getMCPerSocket() const
Returns the number of detected integrated memory controllers per socket.
Definition: cpucounters.h:696
friend uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:1503
uint32 getNumMC() const
Returns the number of detected integrated memory controllers.
Definition: cpucounters.h:179
uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:1903
uint64 getOutgoingDataNonDataFlits(uint32 port)
Get the number of outgoing data and non-data flits from the socket through a port.
Definition: cpucounters.cpp:3904
friend double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:1598
uint64 getDRAMClocks(uint32 channel)
Get number of DRAM channel cycles.
Definition: cpucounters.cpp:4086
uint64 getQPIL1Cycles(uint32 port)
Get number of cycles on a QPI port when the link was in a power saving shutdown mode.
Definition: cpucounters.cpp:4074
double getTotalExecUsage(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per time interval for the entire system combining in...
Definition: cpucounters.h:1565
uint32 getMaxIPC() const
Returns the max number of instructions per cycle.
Definition: cpucounters.h:737
friend uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:1832
friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:1942
Low level interface to access hardware model specific registers.
ServerUncorePowerState getServerUncorePowerState(uint32 socket)
Reads the power/energy counter state of a socket (works only on microarchitecture codename SandyBridg...
Definition: cpucounters.cpp:3342
ErrorCode
Return codes (e.g. for program(..) method)
Definition: cpucounters.h:323
uint64 getImcReads()
Get the number of integrated controller reads (in cache lines)
Definition: cpucounters.cpp:3863
static PCM * getInstance()
Returns PCM object.
Definition: cpucounters.cpp:196
Basic core counter state.
Definition: cpucounters.h:944
uint64 getIncomingDataFlits(uint32 port)
Get the number of incoming data flits to the socket through a port.
Definition: cpucounters.cpp:3891
double getQPItoMCTrafficRatio(const SystemCounterState &before, const SystemCounterState &after)
Get QPI data to Memory Controller traffic ratio.
Definition: cpucounters.h:2155
friend uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1098