三星 exynos

5410硬件结构体

下图是5410内部TMU（Thermal Management Unit）单元，图中Thermal Sensor是温度传感器，负责检测CPU的温度，该温度会和设定的温度报警门限进行比较，内核设计者设置的触发门限被存储在对应的ThresholdRise LevelX（X：0-4），当温度从一个门限跨到另一个门限时会触发中断，温度上升和下降都将产生中断，产生中断后会执行中断服务函数。

一个非常重要的结构体

arch/arm/mach-exynos/board-tf4-power.c

这个结构体非常重要，温度和频率策略就在这里确定了，后面会无数次提到该结构体。该结构体定义于platform 设备侧。

1162 static struct exynos_tmu_platform_data exynos5_tmu_data = { 
1163     .trigger_levels[0] = 70, 
1164     .trigger_levels[1] = 80, 
1165     .trigger_levels[2] = 90, 
1166     .trigger_levels[3] = 100, 
1167     .trigger_level0_en = 1, 
1168     .trigger_level1_en = 1, 
1169     .trigger_level2_en = 1, 
1170     .trigger_level3_en = 1, 
1171     .gain = 5, 
1172     .reference_voltage = 16, 
1173     .noise_cancel_mode = 4, 
1174     .cal_type = TYPE_ONE_POINT_TRIMMING, 
1175     .efuse_value = 55, 
1176     .freq_tab[0] = { 
1177         .freq_clip_max = 1600 * 1000, 
1178         .temp_level = 70, 
1179     }, 
1180     .freq_tab[1] = { 
1181         .freq_clip_max = 1200 * 1000, 
1182         .temp_level = 80, 
1183     }, 
1184     .freq_tab[2] = { 
1185         .freq_clip_max = 1000 * 1000, 
1186         .temp_level = 90, 
1187     }, 
1188     .freq_tab[3] = { 
1189         .freq_clip_max = 900 * 1000, 
1190         .temp_level = 100, 
1191     }, 
1192     .freq_tab[4] = { 
1193         .freq_clip_max = 400 * 1000, 
1194         .temp_level = 115, 
1195     }, 
1196     .size[THERMAL_TRIP_ACTIVE] = 1, 
1197     .size[THERMAL_TRIP_PASSIVE] = 3, 
1198     .size[THERMAL_TRIP_HOT] = 1, 
1199     .freq_tab_count = 5, 
1200     .type = SOC_ARCH_EXYNOS5, 
1201 };

该结构体的相关字段非常重要，trigger_levels[4]是一个具有四个整数的数组，该数组对应的物理器件是ThresholdRise Level X和ThresholdFall Level X。

trigger_level0_en表明是否启用对应的trigger_levels，等于1表明启用。

gain是温度产生模块的放大倍数。

reference_voltage是温度产生模块的参考电压。

noise_cancel_mode:噪声消除模式。

type是SOC的类型

efuse_value平台定义的保险丝值，熔断值。

cal_type温度校准的方法

freq_tab表示频率值

TMU驱动的注册

要理解5410平台的TMU机制，得找一个切入口，这个切入口从TMU驱动的注册流程开始。

在drivers/thermal/exynos_thermal.c文件

1542 static struct platform_driver exynos_tmu_driver = { 
1543     .driver = { 
1544         .name   = "exynos-tmu", 
1545         .owner  = THIS_MODULE, 
1546         .of_match_table = exynos_tmu_match, 
1547     }, 
1548     .probe = exynos_tmu_probe, 
1549     .remove = __devexit_p(exynos_tmu_remove), 
1550     .suspend = exynos_tmu_suspend, 
1551     .resume = exynos_tmu_resume, 
1552     .id_table = exynos_tmu_driver_ids, 
1553 }; 
1554 
1555 module_platform_driver(exynos_tmu_driver);

上面的代码用于注册一个platform驱动，由于linux内核驱动程序会挂载到相应的总线上，bus的名字有很多种，如pci，usb，platform，这里注册的驱动会在/sys/bus/platform/drivers目录下有其信息。

linux下驱动，有platformdriver就要有platformdevice，platformdevice后面再看，这里先看driver的probe过程。

1544行指定了driver的name，该name将会在/sys/bus/platform/drivers/目录下显示。.probe=exynos_tmu_probe用于注册驱动。从module_platform_driver到probe的调用，会经过如下的函数；driver_register->bus_add_driver->driver_attach->bus_for_each_dev->__driver_attach->driver_probe_device->platform_drv_probe;上面的过程一个重要的功能是找注册的platform_device，该设备将会作为参数传递给exynos_tmu_probe函数。

exynos_tmu_probe函数有点长，这里将其按照功能进行分段讲解。

第1315行用于获取开篇介绍的static struct exynos_tmu_platform_data exynos5_tmu_data结构体。该行非常重要，调频策略就是依靠该结构体里的信息的。

1333 INIT_WORK(&data->irq_work, exynos_tmu_work);

第1333行初始化一个工作队列，回调函数是exynos_tmu_work。该workqueue会被源于TMU送过来的温度中断回调。该workqueue的调用对应于一次温度调节的开始。

1335     for (i = 0; i < EXYNOS_TMU_COUNT; i++) { 
1336         data->irq[i] = platform_get_irq(pdev, i); 
1337         if (data->irq[i] < 0) { 
1338             ret = data->irq[i]; 
1339             dev_err(&pdev->dev, "Failed to get platform irq\n"); 
1340             goto err_free; 
1341         } 
1342 
1343         data->mem[i] = platform_get_resource(pdev, IORESOURCE_MEM, i); 
1344         if (!data->mem[i]) { 
1345             ret = -ENOENT; 
1346             dev_err(&pdev->dev, "Failed to get platform resource\n"); 
1347             goto err_free; 
1348         } 
1349 
1350         data->mem[i] = request_mem_region(data->mem[i]->start, 
1351                 resource_size(data->mem[i]), pdev->name); 
1352         if (!data->mem[i]) { 
1353             ret = -ENODEV; 
1354             dev_err(&pdev->dev, "Failed to request memory region\n"); 
1355             goto err_free; 
1356         } 
1357 
1358         data->base[i] = ioremap(data->mem[i]->start, resource_size(data->mem[i])); 
1359         if (!data->base[i]) { 
1360             ret = -ENODEV; 
1361             dev_err(&pdev->dev, "Failed to ioremap memory\n"); 
1362             goto err_mem_region; 
1363         } 
1364 
1365         ret = request_irq(data->irq[i], exynos_tmu_irq, 
1366                 IRQF_TRIGGER_RISING, "exynos-tmu", data); 
1367         if (ret) { 
1368             dev_err(&pdev->dev, "Failed to request irq: %d\n", data->irq[i]); 
1369             goto err_io_remap; 
1370         } 
1371     }

1335～1371行，该文件的开始定义EXYNOS_TMU_COUNT等于4，对应4个TMU（ThermalManagementUnit），这四个TMU分别用于监控CPU的温度。获取其对应的中断号/内存资源，内存范围/io地址空间，并根据获得的中断号，注册中断服务函数exynos_tmu_irq。该中断函数会调度1333行初始化的workqueue。

//获取对应的时钟资源，该资源定义于clock-exynos5410.c文件。

1373     data->clk = clk_get(NULL, "tmu_apbif"); 
1374     if (IS_ERR(data->clk)) { 
1375         ret = PTR_ERR(data->clk); 
1376         dev_err(&pdev->dev, "Failed to get clock\n"); 
1377         goto err_irq; 
1378     } 
1379 
1380     if (pdata->type == SOC_ARCH_EXYNOS5 ||   
1381                 pdata->type == SOC_ARCH_EXYNOS4) 
1382         data->soc = pdata->type; 
1383     else { 
1384         ret = -EINVAL; 
1385         dev_err(&pdev->dev, "Platform not supported\n"); 
1386         goto err_clk; 
1387     } 

1380行的 SOC_ARCH_EXYNOS5在platform设备里确实注册了。

1389     data->pdata = pdata; 
1390     platform_set_drvdata(pdev, data);

1389~1390行将platform设备传递来的信息存储在platform_device里。

1393     for (i = 0; i < EXYNOS_TMU_COUNT; i++) { 
1394         ret = exynos_tmu_initialize(pdev, i); 
1395         if (ret) { 
1396             dev_err(&pdev->dev, "Failed to initialize TMU[%d]\n", i); 
1397             goto err_clk; 
1398         } 
1399 
1400         exynos_tmu_control(pdev, i, true); 
1401     }

1393~1401行设置TMU相关的寄存器，这两个函数和平台硬件是息息相关的。这里只要知道设置完了以后TMU就可以根据platformdevice里要求的温度，CPU当前的温度正确产生中断了。

1403 /*Register the sensor with thermal management interface*/

1404 (&exynos_sensor_conf)->private_data = data;

1405 exynos_sensor_conf.trip_data.trip_count =pdata->trigger_level0_en +

1406 pdata->trigger_level1_en + pdata->trigger_level2_en+

1407 pdata->trigger_level3_en;

1408

1405~1407行，根据平台设备传递的信息初始化conf.trip_data.trip_count，根据开篇给出的结构体，其值将被初始化为4

1409     for (i = 0; i < exynos_sensor_conf.trip_data.trip_count; i++) { 
1410         exynos_sensor_conf.trip_data.trip_val[i] = 
1411             pdata->threshold + pdata->trigger_levels[i]; 
1412         exynos_sensor_conf.trip_data.boost_trip_val[i] = 
1413             pdata->threshold + pdata->boost_trigger_levels[i]; 
1414     }

1409～1414行仍然是根据开篇给出的结构体，初始化trip_val

1415 
1416     exynos_sensor_conf.cooling_data.freq_clip_count = 
1417                         pdata->freq_tab_count; 
1418     exynos_sensor_conf.cooling_data.boost_mode = 0; 
1419 
1420     for (i = 0; i < pdata->freq_tab_count; i++) { 
1421         exynos_sensor_conf.cooling_data.freq_data[i].freq_clip_max = 
1422                     pdata->freq_tab[i].freq_clip_max; 
1423         exynos_sensor_conf.cooling_data.freq_data[i].temp_level = 
1424                     pdata->freq_tab[i].temp_level; 
1425         exynos_sensor_conf.cooling_data.freq_data[i].mask_val = cpu_all_mask; 
1426 
1427         exynos_sensor_conf.cooling_data.boost_freq_data[i].freq_clip_max = 
1428                     pdata->boost_freq_tab[i].freq_clip_max; 
1429         exynos_sensor_conf.cooling_data.boost_freq_data[i].temp_level = 
1430                     pdata->boost_freq_tab[i].temp_level; 
1431         exynos_sensor_conf.cooling_data.boost_freq_data[i].mask_val = cpu_all_mask; 
1432 
1433         exynos_sensor_conf.cooling_data.size[i] = 
1434                     pdata->size[i]; 
1435     } 
1436 
1437 #ifdef THERMAL_WINDOWS 
1438     exynos_sensor_conf.cooling_data.WinAttr[0] = WIN_NORMAL; // For active mode (105) 
1439     exynos_sensor_conf.cooling_data.WinAttr[1] = WIN_WARN; // For passive - i mode (110) 
1440     exynos_sensor_conf.cooling_data.WinAttr[2] = WIN_WARN;   // For passive - ii mode (112) 
1441     exynos_sensor_conf.cooling_data.WinAttr[3] = WIN_CRIT;   // For passive - iii mode (114) 
1442     exynos_sensor_conf.cooling_data.WinAttr[4] = WIN_CRIT;   // For hot mode (115) 
1443 #endif // endif THERMAL_WINDOWS 
1444

1403~1443行，将exynos5_tmu_data结构体里的内容初始化到TMI里，exynos_sensor_conf的thermal_cooling_conf结构体成员里存放了exynos5_tmu_data结构体的大部分信息。这个结构体还是很重要的，重要性体现在后文的1448行。其各个字段的初始化可以看成对开篇给出的那个结构体的拷贝。

1437～1443行WinAttr[0]是针对105度时的调节策略，即105度使用四个核中的最低温度作为判断的依据（WIN_NORMAL），WIN_WARN指平均温度，WIN_CRIT指最大温度。

1448     ret = exynos_register_thermal(&exynos_sensor_conf); 
1449     if (ret) { 
1450         dev_err(&pdev->dev, "Failed to register thermal interface\n"); 
1451         goto err_clk; 
1452     }

1448行函数非常重要，注册内核thermal管理。

1454     th_zone->exynos4_dev = pdev; 
1455 
1456     ret = sysfs_create_group(&pdev->dev.kobj, &exynos_thermal_sensor_attr_group);
1457     if (ret) 
1458         dev_err(&pdev->dev, "cannot create thermal sensor attributes\n"); 
1459 
...
1481 
1482     return ret; 
1483 }

1454~1483是exynos_tmu_probe一些扫尾和出错处理，直接跳过了。

接着exynos_register_thermal函数继续。

该函数依然在exynos_thermal.c文件。

 629 /* Register with the in-kernel thermal management */ 
 630 static int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf) 
 631 { 
 632     int ret, count, tab_size, pos = 0; 
 633     struct freq_clip_table *tab_ptr, *clip_data; 

 640     th_zone = kzalloc(sizeof(struct exynos_thermal_zone), GFP_KERNEL); 
 641     if (!th_zone) 
 642         return -ENOMEM;

第640行exynos_thermal_zone结构体在这里申请了。后文会有一张数据结构拓扑图。

644 th_zone->sensor_conf = sensor_conf;

第644行，将th_zone的sensor_conf成员和传递进来的成员相关联。

646 tab_ptr = (struct freq_clip_table*)sensor_conf->cooling_data.freq_data;

 649     for (count = 0; count < EXYNOS_ZONE_COUNT; count++) { 
 650         tab_size = sensor_conf->cooling_data.size[count]; 
 651         if (tab_size == 0) 
 652             continue; 
 653 
 654         clip_data = (struct freq_clip_table *)&(tab_ptr[pos]); 
 655 
 656 #ifdef CONFIG_CPU_THERMAL 
 657         th_zone->cool_dev[count] = cpufreq_cooling_register( 
 658                         clip_data, tab_size); 
 659 #endif 
 660         pos += tab_size; 
 661 
 662         if (IS_ERR(th_zone->cool_dev[count])) { 
 663             pr_err("Failed to register cpufreq cooling device\n"); 
 664             ret = -EINVAL; 
 665             th_zone->cool_dev_size = count; 
 666             goto err_unregister; 
 667         } 
 668     }

649行的循环会进行三次，ZONE被分成三个，分别是ACTIVE，PASSIVE，HOT，这三者的区别如下：

如果初始主频是1.6G，在ACTIVE模式下初始主频1.6GHz是不变，调频只受ondemand或者其它调频策略影响，这种状态下温度可以说是比较良好的。

在PASSIVE模式下，温度上升已经不能忽视了，需要进行限制了，限制的方法就是将最高主频降低，由1.6G向下降，至于降多少是由一开始的数据结构决定的，根据一开始的那个数据结构可知，在该模式下系统最高主频只能到1.2GHz，不论何种负载这是最高的主频了。

对于HOT，这时温度已经比较高了，需要进一步限制，再往上将是CRITICAL，如果幸运没有宕机，这时5410会将主频永久限制在某一频率上，默认是800MHz。

回到650行，根据一开始的1196~1198行，可以知道这三次的迭代tab_size的值将是1,3,1。这正好对应了五个freq_tab表。

1196     .size[THERMAL_TRIP_ACTIVE] = 1, 
1197     .size[THERMAL_TRIP_PASSIVE] = 3, 
1198     .size[THERMAL_TRIP_HOT] = 1,

645～660行，针对platform设备里的几张频率表来注册cooling设备。660行将freq_tab[0]，freq_tab[1]～freq_tab[3]，freq_tab[4]分三次注册。

1176     .freq_tab[0] = { 
1177         .freq_clip_max = 1600 * 1000, 
1178         .temp_level = 70, 
1179     }, 
1180     .freq_tab[1] = { 
1181         .freq_clip_max = 1200 * 1000, 
1182         .temp_level = 80, 
1183     }, 
1184     .freq_tab[2] = { 
1185         .freq_clip_max = 1000 * 1000, 
1186         .temp_level = 90, 
1187     }, 
1188     .freq_tab[3] = { 
1189         .freq_clip_max = 900 * 1000, 
1190         .temp_level = 100, 
1191     }, 
1192     .freq_tab[4] = { 
1193         .freq_clip_max = 400 * 1000, 
1194         .temp_level = 115, 
1195     },

657行注册cpufreq_cooling_register是比较重要的，前面反复提到的降频，就和这里的有关系。这个函数后面再看，先把这个函数看完。

 671     th_zone->therm_dev = thermal_zone_device_register(sensor_conf->name, 
 672             EXYNOS_ZONE_COUNT, 7, NULL, &exynos_dev_ops, 1, 1, PASSIVE_INTERVAL, 
 673             IDLE_INTERVAL);

671行注册一个thermalzone，其第一个name参数是‘exynos-therm’，第二个参数EXYNOS_ZONE_COUNT是该zone中trip点的个数，参数7是一个bitmask，指示可写的trip温度点。

后面的两个1是passive模式计算温度的系数。PASSIVE_INTERVAL是passive模式调节频率的两次轮询的间隔，默认是100毫秒。IDLE_INTERVAL指示两次轮询温度设置点是否被跨越的时间间隔，单位也是毫秒，如果等于0，则表示使用中断工作方式。

这些是用于更新如下的一些字段的。

/sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
/sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq

再回到cpufreq_cooling_register这一比较重要的函数。

386 struct thermal_cooling_device *cpufreq_cooling_register( 
387     struct freq_clip_table *tab_ptr, unsigned int tab_size) 
388 { 
389     struct thermal_cooling_device *cool_dev; 
390     struct cpufreq_cooling_device *cpufreq_dev = NULL; 
391     struct freq_clip_table *clip_tab; 
392     unsigned int cpufreq_dev_count = 0; 
393     char dev_name[THERMAL_NAME_LENGTH]; 
394     int ret = 0, id = 0, i; 

399     list_for_each_entry(cpufreq_dev, &cooling_cpufreq_list, node) 
400         cpufreq_dev_count++; 
401 
402     cpufreq_dev = kzalloc(sizeof(struct cpufreq_cooling_device), 
403             GFP_KERNEL); 
404     if (!cpufreq_dev) 
405         return ERR_PTR(-ENOMEM); 
406 
...
424 
425     cpufreq_dev->tab_ptr = tab_ptr; 
426     cpufreq_dev->tab_size = tab_size; 
427 
428     ret = get_idr(&cpufreq_idr, &cpufreq_dev->id); 
429     if (ret) { 
430         kfree(cpufreq_dev); 
431         return ERR_PTR(-EINVAL); 
432     } 
433 
434     sprintf(dev_name, "thermal-cpufreq-%d", cpufreq_dev->id);

386~434行，这里省去了一些参数的合法性检查。这个函数总共被调用三次，每个zone会调用该函数注册该zone里的cooling设备。所以400行对应这三次的调用的值将分别是0,1,2.

第428行，434行，为设备选择名称，由于这之前没有注册过，所以三个设备名将依次是thermal-cpufreq-0，thermal-cpufreq-1，thermal-cpufreq-2。cpufreq_dev在三次调用时的情况如下：

cpufreq_dev->tab_ptr = tab_ptr;//指向1176
cpufreq_dev->tab_size = 1;
1176     .freq_tab[0] = { 
1177         .freq_clip_max = 1600 * 1000, 
1178         .temp_level = 70, 
1179     }, 
cpufreq_dev->tab_ptr = tab_ptr;//指向1180
cpufreq_dev->tab_size = 3;
1180     .freq_tab[1] = { 
1181         .freq_clip_max = 1200 * 1000, 
1182         .temp_level = 80, 
1183     }, 
1184     .freq_tab[2] = { 
1185         .freq_clip_max = 1000 * 1000, 
1186         .temp_level = 90, 
1187     }, 
1188     .freq_tab[3] = { 
1189         .freq_clip_max = 900 * 1000, 
1190         .temp_level = 100, 
1191     }, 
cpufreq_dev->tab_ptr = tab_ptr;//指向1192行
cpufreq_dev->tab_size = 1;
1192     .freq_tab[4] = { 
1193         .freq_clip_max = 400 * 1000, 
1194         .temp_level = 115, 
1195     },

第二个参数是cooling方法，该方法也定义在cpu_cooling.c文件中。

进入cooling 策略

static struct thermal_cooling_device_ops const cpufreq_cooling_ops = {
	.get_max_state = cpufreq_get_max_state,
	.get_cur_state = cpufreq_get_cur_state,
	.set_cur_state = cpufreq_set_cur_state,
}; 

443     cpufreq_dev->id = id; 
444     cpufreq_dev->cool_dev = cool_dev; 
445     mutex_lock(&cooling_cpufreq_lock); 
446     list_add_tail(&cpufreq_dev->node, &cooling_cpufreq_list); 
447 
448     /*Register the notifier for first cpufreq cooling device*/ 
449     if (cpufreq_dev_count == 0) 
450         cpufreq_register_notifier(&thermal_cpufreq_notifier_block, 
451                         CPUFREQ_POLICY_NOTIFIER);

446行将设备添加到cooling_cpufreq_list链表，这里才体现400行++的意义。

450行为第一个cpufreq cooling设备注册一个notifier，该函数是ACTIVE模式下的notifier。

thermal_cooling_device_register定义于drivers/thermal/thermal_sys.c。通过该文件名大概可以猜测到该函数的意义。

/sys/class/thermal/cooling_device0 
/sys/class/thermal/cooling_device1 
/sys/class/thermal/cooling_device2 

/sys/class/thermal/cooling_device0/cur_state 
/sys/class/thermal/cooling_device0/max_state 
/sys/class/thermal/cooling_device1/cur_state 
/sys/class/thermal/cooling_device1/max_state 
/sys/class/thermal/cooling_device2/cur_state 
/sys/class/thermal/cooling_device2/max_state

thermal_zone_device_register，需要注意的是该函数1422行注册了一个workqueue，该workqueue会被周期性调用以更新thermalzone里的设备。

1422     INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); 
1423 
1424     thermal_zone_device_update(tz);

thermal_zone_device_update会被周期性调用。

至此注册的流程完成了。

在exynos_tmu_probe里注册一个exynos_tmu_irq中断服务函数，并且也初始化了一个workqueue，即exynos_tmu_work。

1092 static void exynos_tmu_work(struct work_struct *work) 
1093 { 
1094     struct exynos_tmu_data *data = container_of(work, 
1095             struct exynos_tmu_data, irq_work); 
1096 
1097     int i; 
1098 
1099     mutex_lock(&data->lock); 
1100     clk_enable(data->clk); 
1101 
1102 
1103     if (data->soc == SOC_ARCH_EXYNOS5) { 
1104         for (i = 0; i < EXYNOS_TMU_COUNT; i++) { 
1105             writel(EXYNOS5_TMU_CLEAR_RISE_INT|EXYNOS5_TMU_CLEAR_FALL_INT, 
1106                     data->base[i] + EXYNOS_TMU_REG_INTCLEAR); 
1107         } 
1108     } else { 
1109         writel(EXYNOS4_TMU_INTCLEAR_VAL, 
1110                 data->base + EXYNOS_TMU_REG_INTCLEAR); 
1111     } 
1112 
1113     clk_disable(data->clk); 
1114     mutex_unlock(&data->lock); 
1115     exynos_report_trigger(); 
1116     for (i = 0; i < EXYNOS_TMU_COUNT; i++) 
1117         enable_irq(data->irq[i]); 
1118 }

1103~1110行清除TMU的中断标志（向INTCLEAR寄存器写入清除值）。

1115行调用exynos_report_trigger处理。

 246 static void exynos_report_trigger(void) 
 247 { 
 248     unsigned int i; 
 249     char data[10]; 
 250     char *envp[] = { data, NULL }; 
 251 
 252     if (!th_zone || !th_zone->therm_dev) 
 253         return; 
 254 
 255     thermal_zone_device_update(th_zone->therm_dev); 
 256 
 257     mutex_lock(&th_zone->therm_dev->lock); 
 258     /* Find the level for which trip happened */ 
 259     for (i = 0; i < th_zone->sensor_conf->trip_data.trip_count; i++) { 
 260         if (th_zone->therm_dev->last_temperature < 
 261             th_zone->sensor_conf->trip_data.trip_val[i] * MCELSIUS) 
 262             break; 
 263     } 
 264 
 265     pr_debug("[TMU-IRQ] IRQ mode=%d\n",i); 
 266     if (th_zone->mode == THERMAL_DEVICE_ENABLED) { 
 267         if (i > 0) 
 268             th_zone->therm_dev->polling_delay = ACTIVE_INTERVAL; 
 269         else 
 270             th_zone->therm_dev->polling_delay = IDLE_INTERVAL; 
 271     } 
 272 
 273     snprintf(data, sizeof(data), "%u", i); 
 274     kobject_uevent_env(&th_zone->therm_dev->device.kobj, KOBJ_CHANGE, envp); 
 275     mutex_unlock(&th_zone->therm_dev->lock); 
 276 }

255行thermal_zone_device_update是一个关键的函数，就算irq没有发生，该函数也将由thermal_zone_device_check周期性调度。274行使用uevent方法向用户空间报告此事。

thermal_zone_device_update函数中将会遇到若干函数指针，这里先将这些函数指针列出来。

定义于drivers/thermal/exynos_thermal.c里的指针。

/* Operation callback functions for thermal zone */ 
static struct thermal_zone_device_ops const exynos_dev_ops = { 
    .bind = exynos_bind, 
    .unbind = exynos_unbind, 
    .get_temp = exynos_get_temp, 
    .get_mode = exynos_get_mode, 
    .set_mode = exynos_set_mode, 
    .get_trip_type = exynos_get_trip_type, 
    .get_trip_temp = exynos_get_trip_temp, 
    .set_trip_temp = exynos_set_trip_temp, 
    .get_trip_temp_level = exynos_get_trip_temp_level, 
    .set_trip_temp_level = exynos_set_trip_temp_level, 
    .get_trip_freq = exynos_get_trip_freq, 
    .set_trip_freq = exynos_set_trip_freq, 
    .get_boost_mode = exynos_get_boost_mode, 
    .set_boost_mode = exynos_set_boost_mode, 
    .get_crit_temp = exynos_get_crit_temp, 
    .notify = exynos_notify, 
};

定义于drivers/thermal/cpu_cooling.c里的指针。

/*Bind cpufreq callbacks to thermal cooling device ops*/ 
static struct thermal_cooling_device_ops const cpufreq_cooling_ops = { 
    .get_max_state = cpufreq_get_max_state, 
    .get_cur_state = cpufreq_get_cur_state, 
    .set_cur_state = cpufreq_set_cur_state, 
};

定义于drivers/thermal/thermal_sys.c，该函数还是属于thermal框架的，所以针对平台的一些操作就使用传递进来的函数指针进行操作。这样更换平台时，只需要替换前面的函数指针集，而不需要修改框架。

1124 void thermal_zone_device_update(struct thermal_zone_device *tz) 
1125 { 
1126     int count, ret = 0; 
1127     long temp, trip_temp; 
1128     enum thermal_trip_type trip_type; 
1129     struct thermal_cooling_device_instance *instance; 
1130     struct thermal_cooling_device *cdev; 
1131     mutex_lock(&tz->lock); 
1132     
1133     if (tz->ops->get_temp(tz, &temp)) { 
1134         /* get_temp failed - retry it later */ 
1135         pr_warn("failed to read out thermal zone %d\n", tz->id); 
1136         goto leave; 
1137     }

1133行用于读取当前的温度，对应使用的函数是上面的函数指针指向的函数exynos_get_temp，该函数返回的温度和前面设置的温度窗口是有关系的，根据zone类型返回最低，平均以及最高三种温度。

1139     for (count = 0; count < tz->trips; count++) { 
1140         tz->ops->get_trip_type(tz, count, &trip_type); 
1141         tz->ops->get_trip_temp(tz, count, &trip_temp); 
1142         
1143         switch (trip_type) { 
1144         case THERMAL_TRIP_CRITICAL: 
1145             if (temp >= trip_temp) { 
1146                 pr_info("[TMU] CRITICAL: Need tripping\n"); 
1147                 if (tz->ops->notify) 
1148                     ret = tz->ops->notify(tz, count, 
1149                                   trip_type); 
1150                 if (ret) {        
1151                     pr_emerg("Critical temperature reached (%ld C), shutting down\n", 
1152                          temp/1000); 
1153                     orderly_poweroff(true); 
1154                 } 
1155             } 
1156             break; 
1157         case THERMAL_TRIP_HOT: 
1158             if (temp >= trip_temp) 
1159                 if (tz->ops->notify) 
1160                     tz->ops->notify(tz, count, trip_type); 
1161             break; 
1162         case THERMAL_TRIP_ACTIVE: 
1163             list_for_each_entry(instance, &tz->cooling_devices, 
1164                         node) { 
1165                 if (instance->trip != count) 
1166                     continue; 
1167                     
1168                 cdev = instance->cdev; 
1169                 
1170                 if (temp >= trip_temp) 
1171                     cdev->ops->set_cur_state(cdev, 1); 
1172                 else 
1173                     cdev->ops->set_cur_state(cdev, 0); 
1174 #ifdef PERCORE_LOGGING 
1175                 gMaxfreq = cdev->ops->get_max_state(cdev, &dummy_maxstate); 
1176 #endif 
1177             } 
1178             break; 
1179         case THERMAL_TRIP_PASSIVE: 
1180             if (temp >= trip_temp || tz->passive) 
1181                 thermal_zone_device_passive(tz, temp, 
1182                                 trip_temp, count); 
1183             break; 
1184         } 
1185     }

其核心处理集中在这个for循环里，1139行，tz->trips的值是3，将迭代三次，这对应于开篇那个数据结构体的三个trip温度，分别是70,80,90。

1140~1141行根据count的值将有三种情况：

count= 0对应于MONITOR_ZONE，类型将是THERMAL_TRIP_ACTIVE，trip_temp是70度

count= 1对应于WARN_ZONE，类型将是THERMAL_TRIP_PASSIVE，trip_temp是80度

count= 2对应于PANIC_ZONE，类型将是THERMAL_TRIP_CRITICAL，trip_temp是90度

还有一个THERMAL_TRIP_HOT类型，这里暂时没有用到。

对应1143行的switch语句，首先进入的是1162~1178行的case语句：如果temp（调用exynos_get_temp获得的温度）大于等于开篇结构体中设置的70度，则会调用cpufreq_set_cur_state将当前的state设置成1，具体完成设置的工作留到了cpufreq_apply_cooling(cpufreq_device,state);函数完成。如果温度低于70度，则说明非常安全，直接将state计数清零。

第二次进入THERMAL_TRIP_PASSIVE，即1179~1183行。if语句判断温度是否大于等于设定的第二个门限80度（或者tz->passive已经被置位），如果成立，则表明不能再放任ondemand调节频率了，需要限制最高主频，于是调用thermal_zone_device_passive(tz,temp, trip_temp, count);

第三次进入THERMAL_TRIP_CRITICAL，说明这时温度非常高，如果if语句仍然成立，则需要紧急处理，这里其会判断是否到119度，如果到了，会down掉电源，进行保护。

现在有两条路了，一条是THERMAL_TRIP_PASSIVE调用thermal_zone_device_passive，一条是THERMAL_TRIP_ACTIVE下调用cpufreq_set_cur_state设置状态，这两种最终都将汇集到cpufreq_apply_cooling(cpufreq_device,state);函数，先从THERMAL_TRIP_PASSIVE情况开始。

763 static void thermal_zone_device_passive(struct thermal_zone_device *tz, 
 764                     int temp, int trip_temp, int trip) 
 765 { 
...
 778     if (temp >= trip_temp) { 
 779         tz->passive = true; 
 780 
 781         trend = (tz->tc1 * (temp - tz->last_temperature)) + 
 782             (tz->tc2 * (temp - trip_temp)); 
 783 
 784         /* Heating up? */ 
 785         if (trend > 0) { 
 786             pr_debug("[TMU] PASSIVE: Trend up, temp=%d\n", temp); 
 787             list_for_each_entry(instance, &tz->cooling_devices, 
 788                         node) { 
 789                 if (instance->trip != trip) 
 790                     continue; 
 791                 cdev = instance->cdev; 
 792                 cdev->ops->get_cur_state(cdev, &state); 
 793                 cdev->ops->get_max_state(cdev, &max_state); 
 794                 if (state++ < max_state){ 
 795                     cdev->ops->set_cur_state(cdev, state); 
...
 799                 } 
 800             } 
 801         } else if (trend < 0) { /* Cooling off? */ 
 802             pr_debug("[TMU] PASSIVE: Trend down, temp=%d\n", temp); 
 803             list_for_each_entry(instance, &tz->cooling_devices, 
 804                         node) { 
 805                 if (instance->trip != trip) 
 806                     continue; 
 807                 cdev = instance->cdev; 
 808                 cdev->ops->get_cur_state(cdev, &state); 
 809                 cdev->ops->get_max_state(cdev, &max_state); 
 810                 if (state > 0) { 
 811                     pr_info("[TMU] PASSIVE: Below passive trip temp=%d\n", temp); 
 812                     cdev->ops->set_cur_state(cdev, --state); 
...
 816                 } 
 817             } 
 818         } 
 819         return; 
 820     }

778~820行的代码处理的是当读取的温度高于设置的温度门限的处理方法，注意一旦该if语句成立，到819行将会返回，后面的将不会再被处理到。

781行的trend表示的是温度有上升还是下降趋势，在前面的初始化时将tc1和tc2初始化成了1。这里需要举个例子说明trend的意义。

假如门限是80度，当前温度是81度，上一次进入该函数时温度是83度，即tz->last_temperature等于83，则trend = 1×(81-83) + 1×(81-80) = -1，表示温度有下降的趋势，从81度小于83度可以看出这一趋势。

但是都大于80度，都是在passive情况下。这也是784行和801行注释的意义。

787~795找到对应的cooling设备，如果当前的state还没有大于最大的state，则将state加1，并调用cpufreq_set_cur_state设置state，这和ACTIVE方式调用的接口一致。

如果是Coolingoff，则将state减一，并调用cpufreq_set_cur_state设置state。

话说回来如果温度小于WARN门限值，则直接将state的状态减一后再设置就可以了。state对应于开篇的那张表的门限值的个数，到2时主频只能到1G了。

 829     list_for_each_entry(instance, &tz->cooling_devices, node) { 
 830         if (instance->trip != trip) 
 831             continue; 
 832         cdev = instance->cdev; 
 833         cdev->ops->get_cur_state(cdev, &state); 
 834         cdev->ops->get_max_state(cdev, &max_state); 
 835         if (state > 0) 
 836             cdev->ops->set_cur_state(cdev, --state); 
 837         if (state == 0) 
 838             tz->passive = false; 
 839 
...
 844     } 
 845 }

不论TMU处于何种情况，总会调用cpufreq_set_cur_state函数，该函数只是对cpufreq_apply_cooling的简单封装。

343 static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, 
344                  unsigned long state) 
345 { 
346     int ret = -EINVAL; 
347     struct cpufreq_cooling_device *cpufreq_device; 
348 
349     mutex_lock(&cooling_cpufreq_lock); 
350     list_for_each_entry(cpufreq_device, &cooling_cpufreq_list, node) { 
351         if (cpufreq_device && cpufreq_device->cool_dev == cdev) { 
352             ret = 0; 
353             break; 
354         } 
355     } 
356     if (!ret) 
357         ret = cpufreq_apply_cooling(cpufreq_device, state); 
358 
359     mutex_unlock(&cooling_cpufreq_lock); 
360 
361     return ret; 
362 }

350~355行，用于寻找cooling设备，找到cooling设备，然后对该设备调用cpufreq_apply_cooling调节state。

181 static int cpufreq_apply_cooling(struct cpufreq_cooling_device *cpufreq_device, 
182                 unsigned long cooling_state) 
183 { 
184     unsigned int event, cpuid, state; 
185     struct freq_clip_table *th_table, *table_ptr; 
186     const struct cpumask *maskPtr = &cpufreq_device->allowed_cpus; 
187     struct cpufreq_cooling_device *cpufreq_ptr; 
188 
189     if (cooling_state > cpufreq_device->tab_size) 
190         return -EINVAL; 
191 
192     /*Check if the old cooling action is same as new cooling action*/ 
193     if (cpufreq_device->cpufreq_state == cooling_state) 
194         return 0; 
195         
196     /*pass cooling table info to the cpufreq_thermal_notifier callback*/ 
197     notify_table = NOTIFY_INVALID; 
198     
199     if (cooling_state > 0) { 
200         th_table = &(cpufreq_device->tab_ptr[cooling_state - 1]); 
201         notify_table = th_table; 
202     }

193~194行，如果当前所处的cooling state和想要调节到的状态是一样的，则不需要再调节，直接返回。

197~202根据cooling_state获得对应的开篇那张表，表中有两个关键的成员，freq_clip_max和temp_level。

204     /*check if any lower clip frequency active in other cpufreq_device's*/ 
205     list_for_each_entry(cpufreq_ptr, &cooling_cpufreq_list, node) { 
206 
207         state = cpufreq_ptr->cpufreq_state; 
208         if (state == 0 || cpufreq_ptr == cpufreq_device) 
209             continue; 
210 
211         if (!cpumask_equal(&cpufreq_ptr->allowed_cpus, 
212                 &cpufreq_device->allowed_cpus)) 
213             continue; 
214 
215         table_ptr = &(cpufreq_ptr->tab_ptr[state - 1]); 
216         if (notify_table == NULL || 
217                 (table_ptr->freq_clip_max < 
218                 notify_table->freq_clip_max)) 
219             notify_table =  table_ptr; 
220     }

204~220检查cooling_cpufreq_list链表上的cooling设备和notify设备的freq_clip_max之间大小，以最小的为基准。

222     cpufreq_device->cpufreq_state = cooling_state;

保存coolingstate

224     if (notify_table != NOTIFY_INVALID) { 
225         event = CPUFREQ_COOLING_START; 
226         maskPtr = notify_table->mask_val; 
227     } else { 
228         event = CPUFREQ_COOLING_STOP; 
229     }

初始化event类型和maskPtr，maskPtr是降频掩码，取自notify_table->mask_val，前面probe中将其初始化为cpu_all_mask，表示对5410四个核都进行相同的操作。

237     blocking_notifier_call_chain(&cputherm_state_notifier_list, 
238                         event, notify_table);

这里调用了通知链，通知CPUFREQ_COOLING_START事件。传递的参数是notify_table，扫描的链表是cputherm_state_notifier_list。不过并没有发现其通知了什么其它内核子系统。但是当系统除了降频降低温度外还采取其它措施来降低系统的温度，这一行的意义就非常的重要了。在系统上加一个转速可调的风扇，使用这里的notification就不需要额外注册thermalzone来实现风扇转速随温度调节。

240     for_each_cpu(cpuid, maskPtr) { 
241         if (is_cpufreq_valid(cpuid)) 
242             cpufreq_update_policy(cpuid); 
243     } 
244 
245     notify_table = NOTIFY_INVALID; 
246 
247     return 0; 
248 }

240~243行更新cpu调频策略，对每一个CPU执行相同的调频策略更新，总共四个核。

插曲，编写风扇驱动，根据温度调节转速

对于驱动编写而言，下面使用237行notifier的例子还是非常有参考意义的。

#include <linux/cpu_cooling.h>
#define CPUFREQ_COOLING_START		0

static struct notifier_block fan_notifer_block = {
    .notifier_call = s3c_fan_change_event,
};
static int __init s3c_fan_init(void)
{
	int err = 0;

	/* May malloc pwm deivce dynamically? */
	s3c_pwm0_dev.pwm = pwm_request(0, "pwm.motor");
	if (IS_ERR_VALUE((unsigned long)s3c_pwm0_dev.pwm))
		return -ENODEV;
	
	cputherm_register_notifier(&fan_notifer_block, CPUFREQ_COOLING_START);
	
	err =  misc_register(&s3c_fan_driver);
	if (err < 0) {
		return -ENODEV;
	}	
	return 0;
}

static void __exit s3c_fan_exit(void)
{
	misc_deregister(&s3c_fan_driver);
	return;
}

module_init(s3c_fan_init);
module_exit(s3c_fan_exit);

注意头文件以及cputherm_register_notifier的调用，其回调函数的第三个参数是指向struct freq_clip_table类型变量的指针，该类型定义于cpu_cooling.h文件中。

cputherm_register_notifier(&fan_notifer_block, CPUFREQ_COOLING_START);

定义于cpufreq.c文件。

1735 int cpufreq_update_policy(unsigned int cpu) 
1736 { 
1737     struct cpufreq_policy *data = cpufreq_cpu_get(cpu); 
1738     struct cpufreq_policy policy; 
…
1752     memcpy(&policy, data, sizeof(struct cpufreq_policy)); 
1753     policy.min = data->user_policy.min; 
1754     policy.max = data->user_policy.max; 
1755     policy.policy = data->user_policy.policy; 
1756     policy.governor = data->user_policy.governor;

1737行获得percpu变量，policy.min值对应开篇的结构体的900M，policy.max对应于开篇的1.6G，policy.governor是ondemand策略。

1759     if (cpufreq_driver->get) { 
1760         policy.cur = cpufreq_driver->get(cpu); 
1761         if (!data->cur) { 
1762             pr_debug("Driver did not initialize current freq"); 
1763             data->cur = policy.cur; 
1764         } else { 
1765             if (data->cur != policy.cur) 
1766                 cpufreq_out_of_sync(cpu, data->cur, 
1767                                 policy.cur); 
1768         } 
1769     }

在exynos框架里，第1759行的get函数是exynos_getspeed函数，该函数用于获取该策略下的主频，该主频将和percpu变量得到的频率进行对比（第1765行），如果不相等，则1766行函数会被调用，该函数用于异常处理，即实际的频率和保存的CPU频率不一致，则需要做些处理。该函数用于完成此功能。

1760行调用了一个get函数指针，该函数定义于如下结构体。

/*
 * cpufreq driver descriptor for Exynos: binds the cpufreq core callbacks
 * to the exynos_* implementations. No .setpolicy hook is provided.
 */
static struct cpufreq_driver exynos_driver = {
	.name = "exynos_cpufreq",
	.flags = CPUFREQ_STICKY,
	.init = exynos_cpufreq_cpu_init,
	.verify = exynos_verify_speed,
	.target = exynos_target,
	.get = exynos_getspeed,
#ifdef CONFIG_PM
	/* Suspend/resume hooks are only present when CONFIG_PM is enabled. */
	.suspend = exynos_cpufreq_suspend,
	.resume = exynos_cpufreq_resume,
#endif
};

1771     ret = __cpufreq_set_policy(data, &policy); 
1772 
1773     unlock_policy_rwsem_write(cpu); 
1774 
1775 fail: 
1776     cpufreq_cpu_put(data); 
1777 no_policy: 
1778     return ret; 
1779 }

1771行设置cpufreq的策略。

1644 static int __cpufreq_set_policy(struct cpufreq_policy *data, 
1645                 struct cpufreq_policy *policy) 
1646 { 
…
1665     /* adjust if necessary - all reasons */ 
1666     blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 
1667             CPUFREQ_ADJUST, policy); 
1668 
1669     /* adjust if necessary - hardware incompatibility*/ 
1670     blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 
1671             CPUFREQ_INCOMPATIBLE, policy); 
…



1679     /* notification of the new policy */ 
1680     blocking_notifier_call_chain(&cpufreq_policy_notifier_list, 
1681             CPUFREQ_NOTIFY, policy); 


1684     data->min = policy->min; 
1685     data->max = policy->max;

data对应的是percpu变量，可以看成是CPU的当前设置，policy针对的是这次调整的设置，所以data总是跟踪policy设置

1689     if (cpufreq_driver->setpolicy) { 
1690         data->policy = policy->policy; 
1691         pr_debug("setting range\n"); 
1692         ret = cpufreq_driver->setpolicy(policy); 
1693     } else { 
1694         if (policy->governor != data->governor) { 
1695             /* save old, working values */ 
1696             struct cpufreq_governor *old_gov = data->governor; 
1697 
1698             pr_debug("governor switch\n"); 
1699 
1700             /* end old governor */ 
1701             if (data->governor) 
1702                 __cpufreq_governor(data, CPUFREQ_GOV_STOP); 
1703                 
1704             /* start new governor */ 
1705             data->governor = policy->governor; 
1706             if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { 
1707                 /* new governor failed, so re-start old one */ 
1708                 pr_debug("starting governor %s failed\n", 
1709                             data->governor->name); 
1710                 if (old_gov) { 
1711                     data->governor = old_gov; 
1712                     __cpufreq_governor(data, 
1713                                CPUFREQ_GOV_START); 
1714                 }              
1715                 ret = -EINVAL; 
1716                 goto error_out; 
1717             } 
1718             /* might be a policy change, too, so fall through */ 
1719         } 
1720         pr_debug("governor: change or update limits\n"); 
1721         __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); 
1722     } 
1723 
1724 error_out: 
1725     return ret; 
1726 }

1689行的cpufreq_driver->setpolicy在static struct cpufreq_driver exynos_driver结构体中并没有定义，所以执行else分支语句。由于governor一直都是ondemand策略，所以1694~1719行切换governor的代码不会得到执行，核心代码就到1721行了。

1509 static int __cpufreq_governor(struct cpufreq_policy *policy, 
1510                     unsigned int event) 
1511 { 
1512     int ret; 
1513 
1514     /* Only must be defined when default governor is known to have latency 
1515        restrictions, like e.g. conservative or ondemand. 
1516        That this is the case is already ensured in Kconfig 
1517     */ 
1518 #ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE 
1519     struct cpufreq_governor *gov = &cpufreq_gov_performance; 
1520 #else 
1521     struct cpufreq_governor *gov = NULL; 
1522 #endif 
1523 
1524     if (policy->governor->max_transition_latency && 
1525         policy->cpuinfo.transition_latency > 
1526         policy->governor->max_transition_latency) { 
1527         if (!gov) 
1528             return -EINVAL; 
1529         else { 
1530             printk(KERN_WARNING "%s governor failed, too long" 
1531                    " transition latency of HW, fallback" 
1532                    " to %s governor\n", 
1533                    policy->governor->name, 
1534                    gov->name); 
1535             policy->governor = gov; 
1536         } 
1537     } 
1538 
1539     if (!try_module_get(policy->governor->owner)) 
1540         return -EINVAL; 
1541 
1542     pr_debug("__cpufreq_governor for CPU %u, event %u\n", 
1543                         policy->cpu, event); 
1544     ret = policy->governor->governor(policy, event); 
1545 
1546     /* we keep one module reference alive for 
1547             each CPU governed by this CPU */ 
1548     if ((event != CPUFREQ_GOV_START) || ret) 
1549         module_put(policy->governor->owner); 
1550     if ((event == CPUFREQ_GOV_STOP) && !ret) 
1551         module_put(policy->governor->owner); 
1552 
1553     return ret; 
1554 }

1518~1522行在配置了CONFIG_CPU_FREQ_GOV_PERFORMANCE时将gov赋值成performance governor，否则gov为NULL；1524~1537行判断硬件的latency，5410平台的policy->governor->max_transition_latency是10000000，policy->cpuinfo.transition_latency设置的值是100000，即硬件的延迟小于governor允许的最大延迟，所以这时并不需要更换governor（1535行）。

1544行调用governor处理事件。

governor定义于cpufreq_ondemand.c文件。

/*
 * Descriptor for the "ondemand" governor; governor events are dispatched
 * to cpufreq_governor_dbs().
 */
struct cpufreq_governor cpufreq_gov_ondemand = {
	.owner = THIS_MODULE,
	.name = "ondemand",
	.max_transition_latency = TRANSITION_LATENCY_LIMIT,
	.governor = cpufreq_governor_dbs,
};

传递进来的event参数是CPUFREQ_GOV_LIMITS。

1258 static int cpufreq_governor_dbs(struct cpufreq_policy *policy, 
1259                    unsigned int event) 
1260 { 
1261     unsigned int cpu = policy->cpu; 
1262     struct cpu_dbs_info_s *this_dbs_info; 
1263     unsigned int j; 
1264     int rc; 
1265 
1266     this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); 
1268     switch (event) { 
...
1352     case CPUFREQ_GOV_LIMITS: 
1353         mutex_lock(&this_dbs_info->timer_mutex); 
1354         if (policy->max < this_dbs_info->cur_policy->cur) 
1355             __cpufreq_driver_target(this_dbs_info->cur_policy, 
1356                 policy->max, CPUFREQ_RELATION_H); 
1357         else if (policy->min > this_dbs_info->cur_policy->cur) 
1358             __cpufreq_driver_target(this_dbs_info->cur_policy, 
1359                 policy->min, CPUFREQ_RELATION_L); 
1360         mutex_unlock(&this_dbs_info->timer_mutex); 
1361         break; 
1362     } 
1363     return 0; 
1364 }

case语句的逻辑也比较好理解：如果策略允许的最大频率小于当前CPU频率，说明这时候需要把CPU频率降到该最大值（1354~1356行）；如果策略允许的最小频率大于当前CPU频率，则需要把CPU频率升到该最小值（1357~1359行）；否则说明CPU当前频率处于合理范围，不需要调节。

这里并不着急去看频率调节的细枝末节，TMU的调频也是依赖ondemand策略来调节的。

在ondemand策略中，注册一个work，间隔19微秒调用一次do_dbs_timer函数。

INIT_DELAYED_WORK(&dbs_info->work, do_dbs_timer);

1175 static void do_dbs_timer(struct work_struct *work) 
1176 { 
1177     struct cpu_dbs_info_s *dbs_info = 
1178         container_of(work, struct cpu_dbs_info_s, work.work); 
1179     unsigned int cpu = dbs_info->cpu; 
1180     int sample_type = dbs_info->sample_type; 
1181 
1182     int delay; 
1183 
1184     mutex_lock(&dbs_info->timer_mutex); 
1185 
1186     /* Common NORMAL_SAMPLE setup */ 
1187     dbs_info->sample_type = DBS_NORMAL_SAMPLE; 
1188     if (!dbs_tuners_ins.powersave_bias || 
1189         sample_type == DBS_NORMAL_SAMPLE) { 
1190         dbs_check_cpu(dbs_info); 
1191         if (dbs_info->freq_lo) { 
1192             /* Setup timer for SUB_SAMPLE */ 
1193             dbs_info->sample_type = DBS_SUB_SAMPLE; 
1194             delay = dbs_info->freq_hi_jiffies; 
1195         } else { 
1196             /* We want all CPUs to do sampling nearly on 
1197              * same jiffy 
1198              */ 
1199             struct cpufreq_policy *policy = dbs_info->cur_policy; 
1200             dbs_info->rate_mult = 1; 
1201 
1202             delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate 
1203                 / dbs_info->rate_mult); 
1204 
1205             if (num_online_cpus() > 1) 
1206                 delay -= jiffies % delay; 
1207             } 
1208     } else { 
1209         __cpufreq_driver_target(dbs_info->cur_policy, 
1210             dbs_info->freq_lo, CPUFREQ_RELATION_H); 
1211         delay = dbs_info->freq_lo_jiffies; 
1212     } 
1213     schedule_delayed_work_on(cpu, &dbs_info->work, delay); 
1214     mutex_unlock(&dbs_info->timer_mutex); 
1215 }

1187~1208行，由于三星平台dbs_tuners_ins.powersave_bias的值是0，所以该判断条件成立，else语句不成立。1190行之后的if语句判断了dbs_info成员值情况，下面的逻辑依赖该值，所以其更新是非常重要的， dbs_check_cpu该函数将计算CPU的负载，ondemand频率调节策略就是根据负载情况来调节CPU主频的。ondemand调节方法是，对每一次取样（dbs_check_cpu一次调用），如果idle时间小于20%，则会尝试提高CPU主频，如果某个频率可以有30%idle，则会采取降频措施。

1213行再次调度该work，使其在delay时间后运行；1214行释放timer_mutex互斥锁。

关于dbs_check_cpu细节就不展开了，这里主要讲讲计算负载后调节CPU主频的方法。这里提醒一下，CPU的主频并不仅限于开篇那个数据结构，而是以900M为下限，1.6G为上限，递增100M的频率可选结构。
主频增加会调用dbs_freq_increase函数，然而没有对称的降低主频的函数，不过不论是降低主频还是提高主频，都是调用__cpufreq_driver_target函数完成的。

drivers/cpufreq/cpufreq.c
1445 int __cpufreq_driver_target(struct cpufreq_policy *policy,
1446                 unsigned int target_freq,
1447                 unsigned int relation)
1448 {
1449     int retval = -EINVAL;
1450     if (cpufreq_disabled())
1451         return -ENODEV;
1452 
1453     pr_debug("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
1454         target_freq, relation);
1455     if (cpu_online(policy->cpu) && cpufreq_driver->target)
1456         retval = cpufreq_driver->target(policy, target_freq, relation);
1457 
1458     return retval;
1459 }
1460 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);

1456行，调用exynos_target完成接下来的任务，其共有三个参数，第一个参数指向当前的cpufreq策略（policy），第二个参数是期望的目标频率，第三个参数是关系（relation），它决定目标频率不在频率表中时的取整方向：取不低于目标值的最低频率，或者取不高于目标值的最高频率，定义于cpufreq.h文件。

#define CPUFREQ_RELATION_L 0  /* lowest frequency at or above target */
#define CPUFREQ_RELATION_H 1  /* highest frequency below or at target */

从exynos_target这个函数名可以看出该函数和exynos平台是息息相关的，该函数的定义如下：

467 /* Set clock frequency */
 468 static int exynos_target(struct cpufreq_policy *policy,
 469               unsigned int target_freq,
 470               unsigned int relation)
 471 {
 472     cluster_type cur, old_cur;
 473     unsigned int index;
 474     int count, ret = 0;
 475     bool do_switch = false;
...
 491 
 492     /* get current frequency */
 493     freqs[cur]->old = exynos_getspeed(policy->cpu);
 494 
 495     target_freq = max((unsigned int)pm_qos_request(PM_QOS_CPU_FREQ_MIN), target_freq);
 496     target_freq = min((unsigned int)pm_qos_request(PM_QOS_CPU_FREQ_MAX), target_freq);

495~496行的最大和最小频率定义于drivers/cpufreq/exynos5410-cpufreq.c文件。

pm_qos_update_request(&exynos5_cpu_int_qos,160000);

这里我将其设置为了1.6G

 498     count = num_online_cpus();
 499     target_freq = min(target_freq, exynos_info[cur]->max_op_freqs[count]);
 500 
 501     if (cpufreq_frequency_table_target(policy, merge_freq_table,
 502                 target_freq, relation, &index)) {
 503         ret = -EINVAL;
 504         goto out;
 505     }
 506 
 507     target_freq = merge_freq_table[index].frequency;
 508 
 509     if (cur == CA15 && target_freq < STEP_LEVEL_CA15_MIN)
 510         do_switch = true;
 511     else if (cur == CA7 && target_freq > STEP_LEVEL_CA7_MAX)
 512         do_switch = true;
509~512四大核四小核的调频处理，大小核不能同时工作。

 514     /* Current T/F - ARM Core Voltage Down */
 515     if((do_switch == false) && (cur == CA7) && (lp_volt_need == 1)){
 516 
 517         volt = get_match_volt(ID_ARM, ACTUAL_FREQ(freq_min[CA15], CA15));
 518         volt = get_limit_voltage(volt);
 519         regulator_set_voltage(arm_regulator, volt, volt);
 520         lp_volt_need = 0;
 521     }
 522     else if(do_switch && (cur == CA15)){
 523         prev_cur_freq = target_freq;
 524         lp_volt_need = 1;
 525     }
 526     else if(do_switch && (cur == CA7)){
 527         volt = get_match_volt(ID_ARM, ACTUAL_FREQ(prev_cur_freq, CA15));
 528         volt = get_limit_voltage(volt);
 529         regulator_set_voltage(arm_regulator, volt, volt);
 530     }
 531     /* Current T/F - ARM Core Voltage Down */

515~530行，根据大小核的切换情况进行处理，必要时通过regulator_set_voltage调整ARM核心电压。

 532 
 533 #ifdef CONFIG_BL_SWITCHER
 534     if (do_switch) {
 535         cur = exynos_switch(policy, old_cur);
 536         if (old_cur == cur)
 537             goto out;   /* Switching failed, No operation */
 538 
 539         freqs[cur]->old = exynos_getspeed_cluster(cur);
 540         policy->cur = freqs[cur]->old;
 541     }
 542 #endif
 543     /* frequency and volt scaling */
 544     ret = exynos_cpufreq_scale(target_freq, freqs[cur]->old, policy->cpu);
 545 
 546 out:
 547     mutex_unlock(&cpufreq_lock);
 548 
 549     return ret;
 550 }

533~544行，更底层的频率调节。调用exynos5410_set_frequency_CA15，exynos5410_set_clkdiv_CA15等设置CPU主频。

这里再给出两个命令行下调节的有用命令

cat /sys/bus/platform/drivers/exynos-tmu/exynos5-tmu/temp
 cat /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq 

echo 1600000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
echo 600000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq  

cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq

 while true; do cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq;cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq;
cat /sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq; cat /sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq; 
cat /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq; sleep 1; cat /sys/bus/platform/drivers/exynos-tmu/exynos5-tmu/temp; 
sleep 1;done

最后来一张数据结构图：

高通

查看CPU温度

 while true; do cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq;
cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq;
cat /sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_cur_freq; 
cat /sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_cur_freq; 
cat /sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_cur_freq;

cat /sys/class/thermal/thermal_zone5/temp;cat /sys/class/thermal/thermal_zone6/temp;  
cat /sys/class/thermal/thermal_zone7/temp;cat /sys/class/thermal/thermal_zone8/temp;
echo '\n'; 
sleep 2;done

转载自原文链接, 如需删除请联系管理员。

原文链接：cpu 调频温度 ondemand，转载请注明来源！