mperf_monitor.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. /*
  2. * (C) 2010,2011 Thomas Renninger <trenn@suse.de>, Novell Inc.
  3. *
  4. * Licensed under the terms of the GNU GPL License version 2.
  5. */
  6. #if defined(__i386__) || defined(__x86_64__)
  7. #include <stdio.h>
  8. #include <stdint.h>
  9. #include <stdlib.h>
  10. #include <string.h>
  11. #include <limits.h>
  12. #include <cpufreq.h>
  13. #include "helpers/helpers.h"
  14. #include "idle_monitor/cpupower-monitor.h"
  15. #define MSR_APERF 0xE8
  16. #define MSR_MPERF 0xE7
  17. #define MSR_TSC 0x10
  18. #define MSR_AMD_HWCR 0xc0010015
  19. enum mperf_id { C0 = 0, Cx, AVG_FREQ, MPERF_CSTATE_COUNT };
  20. static int mperf_get_count_percent(unsigned int self_id, double *percent,
  21. unsigned int cpu);
  22. static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
  23. unsigned int cpu);
  24. static struct timespec time_start, time_end;
/*
 * The three "states" this monitor exposes, all with per-thread range:
 * C0 (busy) and Cx (any idle) residency reported as percentages via
 * get_count_percent(), and the average frequency in MHz reported as a
 * raw value via get_count().
 */
static cstate_t mperf_cstates[MPERF_CSTATE_COUNT] = {
	{
		.name			= "C0",
		.desc			= N_("Processor Core not idle"),
		.id			= C0,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},
	{
		.name			= "Cx",
		.desc			= N_("Processor Core in an idle state"),
		.id			= Cx,
		.range			= RANGE_THREAD,
		.get_count_percent	= mperf_get_count_percent,
	},
	{
		.name			= "Freq",
		.desc			= N_("Average Frequency (including boost) in MHz"),
		.id			= AVG_FREQ,
		.range			= RANGE_THREAD,
		.get_count		= mperf_get_count_freq,
	},
};
/* How the P0 (mperf reference) frequency is obtained; see init_maxfreq_mode() */
enum MAX_FREQ_MODE { MAX_FREQ_SYSFS, MAX_FREQ_TSC_REF };

static int max_freq_mode;

/*
 * The max frequency mperf is ticking at (in C0), either retrieved via:
 * 1) calculated after measurements if we know TSC ticks at mperf/P0 frequency
 * 2) cpufreq /sys/devices/.../cpu0/cpufreq/cpuinfo_max_freq at init time
 * 1. Is preferred as it also works without cpufreq subsystem (e.g. on Xen)
 *
 * Value is in MHz (sysfs kHz value is divided by 1000; TSC ticks per
 * microsecond is MHz by construction).
 */
static unsigned long max_frequency;

/* TSC snapshots framing the measure interval (taken in mperf_start/mperf_stop) */
static unsigned long long tsc_at_measure_start;
static unsigned long long tsc_at_measure_end;

/* Per-CPU APERF/MPERF MSR snapshots taken at interval start and end */
static unsigned long long *mperf_previous_count;
static unsigned long long *aperf_previous_count;
static unsigned long long *mperf_current_count;
static unsigned long long *aperf_current_count;

/* valid flag for all CPUs. If a MSR read failed it will be zero */
static int *is_valid;
  65. static int mperf_get_tsc(unsigned long long *tsc)
  66. {
  67. int ret;
  68. ret = read_msr(0, MSR_TSC, tsc);
  69. if (ret)
  70. dprint("Reading TSC MSR failed, returning %llu\n", *tsc);
  71. return ret;
  72. }
  73. static int mperf_init_stats(unsigned int cpu)
  74. {
  75. unsigned long long val;
  76. int ret;
  77. ret = read_msr(cpu, MSR_APERF, &val);
  78. aperf_previous_count[cpu] = val;
  79. ret |= read_msr(cpu, MSR_MPERF, &val);
  80. mperf_previous_count[cpu] = val;
  81. is_valid[cpu] = !ret;
  82. return 0;
  83. }
  84. static int mperf_measure_stats(unsigned int cpu)
  85. {
  86. unsigned long long val;
  87. int ret;
  88. ret = read_msr(cpu, MSR_APERF, &val);
  89. aperf_current_count[cpu] = val;
  90. ret |= read_msr(cpu, MSR_MPERF, &val);
  91. mperf_current_count[cpu] = val;
  92. is_valid[cpu] = !ret;
  93. return 0;
  94. }
/*
 * Report the C0 (busy) or Cx (any idle) residency of @cpu as a
 * percentage over the last measure interval.
 *
 * MPERF only ticks while the core is in C0, so the busy share is the
 * MPERF delta divided by a reference that ticks over the whole
 * interval:
 *  - MAX_FREQ_TSC_REF:  the TSC delta (TSC and MPERF tick at the same
 *    P0 rate on these systems)
 *  - MAX_FREQ_SYSFS:    expected ticks = max_frequency (MHz) * elapsed
 *    wall-clock time in microseconds
 * Cx is then simply 100% minus C0.
 *
 * Returns 0 on success, -1 when @id is not C0/Cx, when the MSR reads
 * for @cpu failed, or when max_freq_mode holds an unknown value.
 */
static int mperf_get_count_percent(unsigned int id, double *percent,
				   unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, tsc_diff;
	unsigned long long timediff;

	if (!is_valid[cpu])
		return -1;

	if (id != C0 && id != Cx)
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		*percent = 100.0 * mperf_diff / tsc_diff;
		dprint("%s: TSC Ref - mperf_diff: %llu, tsc_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, tsc_diff);
	} else if (max_freq_mode == MAX_FREQ_SYSFS) {
		/* MHz * us == expected MPERF ticks at full P0 residency */
		timediff = max_frequency * timespec_diff_us(time_start, time_end);
		*percent = 100.0 * mperf_diff / timediff;
		dprint("%s: MAXFREQ - mperf_diff: %llu, time_diff: %llu\n",
		       mperf_cstates[id].name, mperf_diff, timediff);
	} else
		return -1;

	if (id == Cx)
		*percent = 100.0 - *percent;

	dprint("%s: previous: %llu - current: %llu - (%u)\n",
	       mperf_cstates[id].name, mperf_diff, aperf_diff, cpu);
	dprint("%s: %f\n", mperf_cstates[id].name, *percent);
	return 0;
}
/*
 * Report the average frequency (including boost) of @cpu in MHz over
 * the last measure interval.
 *
 * avg freq = P0 (max) frequency * APERF delta / MPERF delta: APERF
 * ticks at the current frequency while MPERF ticks at the P0 rate, so
 * their ratio scales the reference frequency.
 *
 * In MAX_FREQ_TSC_REF mode the P0 frequency itself is derived from
 * the measurement: TSC ticks / elapsed microseconds == MHz.  Note
 * this updates the file-scope max_frequency as a side effect.
 *
 * Returns 0 on success, 1 when @id is not AVG_FREQ, -1 when the MSR
 * reads for @cpu failed.
 */
static int mperf_get_count_freq(unsigned int id, unsigned long long *count,
				unsigned int cpu)
{
	unsigned long long aperf_diff, mperf_diff, time_diff, tsc_diff;

	if (id != AVG_FREQ)
		return 1;

	if (!is_valid[cpu])
		return -1;

	mperf_diff = mperf_current_count[cpu] - mperf_previous_count[cpu];
	aperf_diff = aperf_current_count[cpu] - aperf_previous_count[cpu];

	if (max_freq_mode == MAX_FREQ_TSC_REF) {
		/* Calculate max_freq (MHz) from the TSC tick count */
		tsc_diff = tsc_at_measure_end - tsc_at_measure_start;
		time_diff = timespec_diff_us(time_start, time_end);
		max_frequency = tsc_diff / time_diff;
	}

	*count = max_frequency * ((double)aperf_diff / mperf_diff);
	dprint("%s: Average freq based on %s maximum frequency:\n",
	       mperf_cstates[id].name,
	       (max_freq_mode == MAX_FREQ_TSC_REF) ? "TSC calculated" : "sysfs read");
	dprint("max_frequency: %lu\n", max_frequency);
	dprint("aperf_diff: %llu\n", aperf_diff);
	dprint("mperf_diff: %llu\n", mperf_diff);
	dprint("avg freq: %llu\n", *count);
	return 0;
}
/*
 * Begin a measure interval: record the wall-clock start time, take the
 * start TSC snapshot, then take the per-CPU APERF/MPERF start
 * snapshots.  The second TSC read is debug-only and shows how many TSC
 * ticks the per-CPU MSR loop itself consumed.  Always returns 0.
 */
static int mperf_start(void)
{
	int cpu;
	unsigned long long dbg;

	clock_gettime(CLOCK_REALTIME, &time_start);
	mperf_get_tsc(&tsc_at_measure_start);

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_init_stats(cpu);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_start);
	return 0;
}
/*
 * End a measure interval: take the per-CPU APERF/MPERF end snapshots,
 * then the end TSC snapshot and wall-clock end time.  The extra TSC
 * read is debug-only.  Always returns 0.
 *
 * NOTE(review): the end TSC snapshot is taken after the per-CPU loop
 * while mperf_start() takes its snapshot before its loop, so tsc_diff
 * also includes the duration of both MSR read loops — confirm this
 * bias is acceptable for the C0/Cx percentages on large CPU counts.
 */
static int mperf_stop(void)
{
	unsigned long long dbg;
	int cpu;

	for (cpu = 0; cpu < cpu_count; cpu++)
		mperf_measure_stats(cpu);

	mperf_get_tsc(&tsc_at_measure_end);
	clock_gettime(CLOCK_REALTIME, &time_end);

	mperf_get_tsc(&dbg);
	dprint("TSC diff: %llu\n", dbg - tsc_at_measure_end);
	return 0;
}
  175. /*
  176. * Mperf register is defined to tick at P0 (maximum) frequency
  177. *
  178. * Instead of reading out P0 which can be tricky to read out from HW,
  179. * we use TSC counter if it reliably ticks at P0/mperf frequency.
  180. *
  181. * Still try to fall back to:
  182. * /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq
  183. * on older Intel HW without invariant TSC feature.
  184. * Or on AMD machines where TSC does not tick at P0 (do not exist yet, but
  185. * it's still double checked (MSR_AMD_HWCR)).
  186. *
  187. * On these machines the user would still get useful mperf
  188. * stats when acpi-cpufreq driver is loaded.
  189. */
  190. static int init_maxfreq_mode(void)
  191. {
  192. int ret;
  193. unsigned long long hwcr;
  194. unsigned long min;
  195. if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_INV_TSC))
  196. goto use_sysfs;
  197. if (cpupower_cpu_info.vendor == X86_VENDOR_AMD) {
  198. /* MSR_AMD_HWCR tells us whether TSC runs at P0/mperf
  199. * freq.
  200. * A test whether hwcr is accessable/available would be:
  201. * (cpupower_cpu_info.family > 0x10 ||
  202. * cpupower_cpu_info.family == 0x10 &&
  203. * cpupower_cpu_info.model >= 0x2))
  204. * This should be the case for all aperf/mperf
  205. * capable AMD machines and is therefore safe to test here.
  206. * Compare with Linus kernel git commit: acf01734b1747b1ec4
  207. */
  208. ret = read_msr(0, MSR_AMD_HWCR, &hwcr);
  209. /*
  210. * If the MSR read failed, assume a Xen system that did
  211. * not explicitly provide access to it and assume TSC works
  212. */
  213. if (ret != 0) {
  214. dprint("TSC read 0x%x failed - assume TSC working\n",
  215. MSR_AMD_HWCR);
  216. return 0;
  217. } else if (1 & (hwcr >> 24)) {
  218. max_freq_mode = MAX_FREQ_TSC_REF;
  219. return 0;
  220. } else { /* Use sysfs max frequency if available */ }
  221. } else if (cpupower_cpu_info.vendor == X86_VENDOR_INTEL) {
  222. /*
  223. * On Intel we assume mperf (in C0) is ticking at same
  224. * rate than TSC
  225. */
  226. max_freq_mode = MAX_FREQ_TSC_REF;
  227. return 0;
  228. }
  229. use_sysfs:
  230. if (cpufreq_get_hardware_limits(0, &min, &max_frequency)) {
  231. dprint("Cannot retrieve max freq from cpufreq kernel "
  232. "subsystem\n");
  233. return -1;
  234. }
  235. max_freq_mode = MAX_FREQ_SYSFS;
  236. max_frequency /= 1000; /* Default automatically to MHz value */
  237. return 0;
  238. }
  239. /*
  240. * This monitor provides:
  241. *
  242. * 1) Average frequency a CPU resided in
  243. * This always works if the CPU has aperf/mperf capabilities
  244. *
  245. * 2) C0 and Cx (any sleep state) time a CPU resided in
  246. * Works if mperf timer stops ticking in sleep states which
  247. * seem to be the case on all current HW.
  248. * Both is directly retrieved from HW registers and is independent
  249. * from kernel statistics.
  250. */
  251. struct cpuidle_monitor mperf_monitor;
  252. struct cpuidle_monitor *mperf_register(void)
  253. {
  254. if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_APERF))
  255. return NULL;
  256. if (init_maxfreq_mode())
  257. return NULL;
  258. /* Free this at program termination */
  259. is_valid = calloc(cpu_count, sizeof(int));
  260. mperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
  261. aperf_previous_count = calloc(cpu_count, sizeof(unsigned long long));
  262. mperf_current_count = calloc(cpu_count, sizeof(unsigned long long));
  263. aperf_current_count = calloc(cpu_count, sizeof(unsigned long long));
  264. mperf_monitor.name_len = strlen(mperf_monitor.name);
  265. return &mperf_monitor;
  266. }
  267. void mperf_unregister(void)
  268. {
  269. free(mperf_previous_count);
  270. free(aperf_previous_count);
  271. free(mperf_current_count);
  272. free(aperf_current_count);
  273. free(is_valid);
  274. }
/*
 * Monitor descriptor handed out by mperf_register().  Requires root
 * because it reads MSRs directly.  overflow_s stays just below the
 * 922337203 s at which a 64 bit counter wraps at 20GHz.
 */
struct cpuidle_monitor mperf_monitor = {
	.name			= "Mperf",
	.hw_states_num		= MPERF_CSTATE_COUNT,
	.hw_states		= mperf_cstates,
	.start			= mperf_start,
	.stop			= mperf_stop,
	.do_register		= mperf_register,
	.unregister		= mperf_unregister,
	.needs_root		= 1,
	.overflow_s		= 922000000 /* 922337203 seconds TSC overflow
					       at 20GHz */
};
  287. #endif /* #if defined(__i386__) || defined(__x86_64__) */