Introduction
In the previous article of this series we discussed the problem statement for the data analysis task, took the first steps in setting up a machine learning model, and built an interface convenient for an application programmer to use. Today we will investigate the problem further: we will experiment with new features, try more complex models, and explore the options for their tuning parameters.

Wherever possible, this article uses terms chosen by the author on the basis of literal translations of the English terms and the jargon established in the community; you can read about that here.

Recall that we stopped at the point of tuning the model and evaluating the quality of its predictions on a validation sample.
The current code [research.py]
import pickle
import math
import numpy
from sklearn.linear_model import LinearRegression

TRAIN_SAMPLES_NUM = 20000


def load_data():
    list_of_instances = []
    list_of_labels = []
    with open('./data/competition_data/train_set.csv') as input_stream:
        header_line = input_stream.readline()
        columns = header_line.strip().split(',')
        for line in input_stream:
            new_instance = dict(zip(columns[:-1], line.split(',')[:-1]))
            new_label = float(line.split(',')[-1])
            list_of_instances.append(new_instance)
            list_of_labels.append(new_label)
    return list_of_instances, list_of_labels


def is_bracket_pricing(instance):
    if instance['bracket_pricing'] == 'Yes':
        return [1]
    elif instance['bracket_pricing'] == 'No':
        return [0]
    else:
        raise ValueError


def get_quantity(instance):
    return [int(instance['quantity'])]


def get_min_order_quantity(instance):
    return [int(instance['min_order_quantity'])]


def get_annual_usage(instance):
    return [int(instance['annual_usage'])]


def get_absolute_date(instance):
    return [365 * int(instance['quote_date'].split('-')[0])
            + 12 * int(instance['quote_date'].split('-')[1])
            + int(instance['quote_date'].split('-')[2])]


SUPPLIERS_LIST = [
    'S-0058', 'S-0013', 'S-0050', 'S-0011', 'S-0070', 'S-0104', 'S-0012',
    'S-0068', 'S-0041', 'S-0023', 'S-0092', 'S-0095', 'S-0029', 'S-0051',
    'S-0111', 'S-0064', 'S-0005', 'S-0096', 'S-0062', 'S-0004', 'S-0059',
    'S-0031', 'S-0078', 'S-0106', 'S-0060', 'S-0090', 'S-0072', 'S-0105',
    'S-0087', 'S-0080', 'S-0061', 'S-0108', 'S-0042', 'S-0027', 'S-0074',
    'S-0081', 'S-0025', 'S-0024', 'S-0030', 'S-0022', 'S-0014', 'S-0054',
    'S-0015', 'S-0008', 'S-0007', 'S-0009', 'S-0056', 'S-0026', 'S-0107',
    'S-0066', 'S-0018', 'S-0109', 'S-0043', 'S-0046', 'S-0003', 'S-0006',
    'S-0097']


def get_supplier(instance):
    if instance['supplier'] in SUPPLIERS_LIST:
        supplier_index = SUPPLIERS_LIST.index(instance['supplier'])
        result = ([0] * supplier_index + [1]
                  + [0] * (len(SUPPLIERS_LIST) - supplier_index - 1))
    else:
        result = [0] * len(SUPPLIERS_LIST)
    return result


def get_assembly(instance):
    assembly_id = int(instance['tube_assembly_id'].split('-')[1])
    result = [0] * assembly_id + [1] + [0] * (25000 - assembly_id - 1)
    return result


def get_assembly_specs(instance, assembly_to_specs):
    result = [0] * 100
    for spec in assembly_to_specs[instance['tube_assembly_id']]:
        result[int(spec.split('-')[1])] = 1
    return result


def to_sample(instance, additional_data):
    return (is_bracket_pricing(instance)
            + get_quantity(instance)
            + get_min_order_quantity(instance)
            + get_annual_usage(instance)
            + get_absolute_date(instance)
            + get_supplier(instance)
            + get_assembly_specs(instance, additional_data['assembly_to_specs']))


def to_interim_label(label):
    return math.log(label + 1)


def to_final_label(interim_label):
    return math.exp(interim_label) - 1


def load_additional_data():
    result = dict()
    assembly_to_specs = dict()
    with open('data/competition_data/specs.csv') as input_stream:
        header_line = input_stream.readline()
        for line in input_stream:
            tube_assembly_id = line.split(',')[0]
            specs = []
            for spec in line.strip().split(',')[1:]:
                if spec != 'NA':
                    specs.append(spec)
            assembly_to_specs[tube_assembly_id] = specs
    result['assembly_to_specs'] = assembly_to_specs
    return result


if __name__ == '__main__':
    list_of_instances, list_of_labels = load_data()
    print(len(list_of_instances), len(list_of_labels))
    print(list_of_instances[:3])
    print(list_of_labels[:3])
[generate_response.py]
import pickle
import numpy
import research


class FinalModel(object):

    def __init__(self, model, to_sample, additional_data):
        self._model = model
        self._to_sample = to_sample
        self._additional_data = additional_data

    def process(self, instance):
        return self._model.predict(numpy.array(self._to_sample(
            instance, self._additional_data)).reshape(1, -1))[0]


if __name__ == '__main__':
    with open('./data/model.mdl', 'rb') as input_stream:
        model = pickle.loads(input_stream.read())
    additional_data = research.load_additional_data()
    final_model = FinalModel(model, research.to_sample, additional_data)
    print(final_model.process({'tube_assembly_id': 'TA-00001',
                               'supplier': 'S-0066',
                               'quote_date': '2013-06-23',
                               'annual_usage': '0',
                               'min_order_quantity': '0',
                               'bracket_pricing': 'Yes',
                               'quantity': '1'}))
The algorithm is trained on several features selected from the descriptions of the objects (a domain that is, frankly, rather dull and at first glance far removed from IT: industrial tubes). Based on these attributes, the key function to_sample() is built; at the moment it looks like this:
def to_sample(instance, additional_data):
    return (is_bracket_pricing(instance)
            + get_quantity(instance)
            + get_min_order_quantity(instance)
            + get_annual_usage(instance)
            + get_absolute_date(instance)
            + get_supplier(instance)
            + get_assembly_specs(instance, additional_data['assembly_to_specs']))
As input it takes the description of an object (the instance variable) from the main train_set.csv file, together with a set of additional data generated from the remaining files of the dataset; as output it returns a fixed-length array, which is then fed to the machine learning algorithm.
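For orientation, here is roughly how these pieces are glued together during validation. This is a reconstruction based on the previous article's setup rather than a verbatim quote from research.py; the split constant TRAIN_SAMPLES_NUM and the helper functions are the ones defined above.

import numpy
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

list_of_instances, list_of_labels = load_data()
additional_data = load_additional_data()
list_of_samples = [to_sample(instance, additional_data)
                   for instance in list_of_instances]
# The labels are log-transformed, so plain MSE on them tracks the
# competition's logarithmic metric (see to_interim_label()).
list_of_interim_labels = [to_interim_label(label) for label in list_of_labels]

train_samples = numpy.array(list_of_samples[:TRAIN_SAMPLES_NUM])
train_labels = numpy.array(list_of_interim_labels[:TRAIN_SAMPLES_NUM])
validation_samples = numpy.array(list_of_samples[TRAIN_SAMPLES_NUM:])
validation_labels = numpy.array(list_of_interim_labels[TRAIN_SAMPLES_NUM:])

model = LinearRegression()
model.fit(train_samples, train_labels)
predictions = model.predict(validation_samples)
print('Mean Squared Error: {0}'.format(
    mean_squared_error(validation_labels, predictions)))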
As for the modeling itself, there has been no particular progress so far: the basic linear regression from the Scikit-Learn package is still being used, with no settings beyond the defaults. Nevertheless, for the time being we will keep gradually growing the list of features in order to improve the quality of the algorithm's predictions. Last time we already used (in a very, very naive way) all the columns of the main training-data file train_set.csv and of the auxiliary file specs.csv. Well then, this time let's turn our attention to the other files with additional data. In particular, the contents of the bill_of_materials.csv file, which describes the components each product is made of, look promising.
Further selection of features
$ head ./data/competition_data/bill_of_materials.csv
tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
TA-00001,C-1622,2,C-1629,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00002,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00003,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00004,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00005,C-1624,1,C-1631,1,C-1641,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00006,C-1624,1,C-1631,1,C-1641,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00007,C-1622,2,C-1629,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00008,C-1312,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
TA-00009,C-1625,2,C-1632,2,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA
As you can see, the file format is similar to that of specs.csv. Let's first check what kinds of components actually occur in it, and use this information to judge which features it makes sense to build from this auxiliary data.
>>> set_of_components = set()
>>> with open('./data/competition_data/bill_of_materials.csv') as input_stream:
...     header_line = input_stream.readline()
...     for line in input_stream:
...         for i in range(1, 16, 2):
...             new_component = line.split(',')[i]
...             set_of_components.add(new_component)
...
>>> len(set_of_components)
2049
>>> sorted(set_of_components)[:10]
['9999', 'C-0001', 'C-0002', 'C-0003', 'C-0004', 'C-0005', 'C-0006', 'C-0007', 'C-0008', 'C-0009']
It turns out there are quite a lot of components, so expanding this attribute in the standard way into an array of more than 2000 elements does not look very reasonable. Let's try using only the most popular ones, and keep the variable responsible for how many of them we take into account as a tuning parameter, to be optimized at the fine-tuning stage.
def load_additional_data():
    result = dict()
    ...
    assembly_to_components = dict()
    component_to_popularity = dict()
    with open('./data/competition_data/bill_of_materials.csv') as input_stream:
        header_line = input_stream.readline()
        for line in input_stream:
            tube_assembly_id = line.split(',')[0]
            assembly_to_components[tube_assembly_id] = dict()
            for i in range(1, 16, 2):
                new_component = line.split(',')[i]
                if new_component != 'NA':
                    quantity = int(line.split(',')[i + 1])
                    assembly_to_components[tube_assembly_id][new_component] = quantity
                    if new_component in component_to_popularity:
                        component_to_popularity[new_component] += 1
                    else:
                        component_to_popularity[new_component] = 1
    # requires "import operator" at the top of the module
    components_by_popularity = [value[0] for value in sorted(
        component_to_popularity.items(),
        key=operator.itemgetter(1, 0), reverse=True)]
    result['assembly_to_components'] = assembly_to_components
    result['components_by_popularity'] = components_by_popularity
    ...


def get_assembly_components(instance, assembly_to_components,
                            components_by_popularity, number_of_components):
    """ number_of_components: number of most popular components taken into account """
    result = [0] * number_of_components
    for component in sorted(assembly_to_components[instance['tube_assembly_id']]):
        component_index = components_by_popularity.index(component)
        if component_index < number_of_components:
            # (tail reconstructed: the original snippet is cut off here;
            # marking the component as present is the simplest variant)
            result[component_index] = 1
    return result
Let's run the program without the new feature and recall the validation result:
Mean Squared Error: 0.7754770419953809
And now with the new feature added:
Mean Squared Error: 0.711158610883329
As you can see, prediction quality has once again improved considerably. Incidentally, we have acquired our first numeric tuning parameter: the number of most popular components taken into account. General considerations suggest that increasing this parameter should improve prediction quality, since more and more previously unused information gets taken into account. Let's try varying it.
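A sketch of how such a sweep can be scripted. In the article the constant appears to be edited by hand between runs; passing number_of_components through an extra to_sample() argument, as below, is a hypothetical refactoring, and the arrays reuse the training sketch shown earlier.

for number_of_components in (100, 200, 150, 170, 160, 165, 168, 166, 167):
    # Rebuild the samples with the given number of most popular components.
    list_of_samples = [to_sample(instance, additional_data, number_of_components)
                       for instance in list_of_instances]
    model = LinearRegression()
    model.fit(numpy.array(list_of_samples[:TRAIN_SAMPLES_NUM]),
              numpy.array(list_of_interim_labels[:TRAIN_SAMPLES_NUM]))
    predictions = model.predict(numpy.array(list_of_samples[TRAIN_SAMPLES_NUM:]))
    print('{0}: {1}'.format(
        number_of_components,
        mean_squared_error(numpy.array(list_of_interim_labels[TRAIN_SAMPLES_NUM:]),
                           predictions)))

The results: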
100: 0.711158610883329
200: 16433833.592963027
150: 0.7110152873760721
170: 19183113.422557358
160: 0.7107685953594116
165: 0.7119011633609398
168: 24813512.02303443
166: 0.7119603793730067
167: 0.7119604617354474
It is still hard to say what sort of catastrophe happens once the value reaches 168 that makes prediction quality degrade so sharply; apart from that, there are no big changes. To clear our conscience, let's also see how prediction quality changes as the parameter decreases.
80: 0.7116311373463766
50: 0.7211560841347712
30: 0.7548570148032887
70: 0.7121518708790175

As the number of components decreases, the error grows, while for values between 80 and 160 it remains nearly constant. Let's keep the parameter equal to 100 for now.
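The updated to_sample() itself is not shown in the article; presumably, with the chosen constant, it now ends like this (a reconstruction):

def to_sample(instance, additional_data):
    return (is_bracket_pricing(instance)
            + get_quantity(instance)
            + get_min_order_quantity(instance)
            + get_annual_usage(instance)
            + get_absolute_date(instance)
            + get_supplier(instance)
            + get_assembly_specs(instance, additional_data['assembly_to_specs'])
            + get_assembly_components(
                instance,
                additional_data['assembly_to_components'],
                additional_data['components_by_popularity'],
                100))  # number of most popular components, fixed for now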
As you can see, adding new features has noticeably improved the validation results. Let's run a few more experiments of this kind, and after that move on to varying the training model and its training parameters, including more complex configurations. Judging by the description of the data files, the key file is tube.csv:
$ head data/competition_data/tube.csv
tube_assembly_id,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other
TA-00001,SP-0035,12.7,1.65,164,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0
TA-00002,SP-0019,6.35,0.71,137,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0
TA-00003,SP-0019,6.35,0.71,127,7,19.05,N,N,N,N,EF-008,EF-008,0,0,0
TA-00004,SP-0019,6.35,0.71,137,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0
TA-00005,SP-0029,19.05,1.24,109,4,50.8,N,N,N,N,EF-003,EF-003,0,0,0
TA-00006,SP-0029,19.05,1.24,79,4,50.8,N,N,N,N,EF-003,EF-003,0,0,0
TA-00007,SP-0035,12.7,1.65,202,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0
TA-00008,SP-0039,6.35,0.71,174,6,19.05,N,N,N,N,EF-008,EF-008,0,0,0
TA-00009,SP-0029,25.4,1.65,135,4,63.5,N,N,N,N,EF-003,EF-003,0,0,0
The contents of the material_id column resemble the values specified in specs.csv, and it is still hard to say what exactly they mean. Searching for some of the SP-xyzt values found in the first 20 lines of specs.csv turned up nothing, but let's keep this attribute in mind just in case. Also, based on the competition description, it is difficult to understand the actual difference between the material indicated in this column and the materials listed in bill_of_materials.csv. For now, rather than puzzling over such subtle questions, let's analyze the number of possible values in our usual way and, if possible, turn them into features useful to the algorithm.
>>> set_of_materials = set()
>>> with open('./data/competition_data/tube.csv') as input_stream:
...     header_line = input_stream.readline()
...     for line in input_stream:
...         new_material = line.split(',')[1]
...         set_of_materials.add(new_material)
...
>>> len(set_of_materials)
20
>>> set_of_materials
{'SP-0034', 'SP-0037', 'SP-0039', 'SP-0030', 'SP-0029', 'NA', 'SP-0046', 'SP-0028', 'SP-0031', 'SP-0032', 'SP-0033', 'SP-0019', 'SP-0048', 'SP-0008', 'SP-0045', 'SP-0035', 'SP-0044', 'SP-0036', 'SP-0041', 'SP-0038'}
Since there turned out to be not that many options, nothing really prevents us from adding a standard categorical attribute. To do so, we first load an assembly_to_material dictionary in the load_additional_data() function:
assembly_to_material = dict()
with open('./data/competition_data/tube.csv') as input_stream:
    header_line = input_stream.readline()
    for line in input_stream:
        tube_assembly_id = line.split(',')[0]
        material_id = line.split(',')[1]
        assembly_to_material[tube_assembly_id] = material_id
result['assembly_to_material'] = assembly_to_material
And then use it in a function written by analogy with one of the existing ones:
MATERIALS_LIST = [
    'NA', 'SP-0008', 'SP-0019', 'SP-0028', 'SP-0029', 'SP-0030', 'SP-0031',
    'SP-0032', 'SP-0033', 'SP-0034', 'SP-0035', 'SP-0036', 'SP-0037',
    'SP-0038', 'SP-0039', 'SP-0041', 'SP-0044', 'SP-0045', 'SP-0046',
    'SP-0048']


def get_material(instance, assembly_to_material):
    material = assembly_to_material[instance['tube_assembly_id']]
    if material in MATERIALS_LIST:
        material_index = MATERIALS_LIST.index(material)
        result = ([0] * material_index + [1]
                  + [0] * (len(MATERIALS_LIST) - material_index - 1))
    else:
        result = [0] * len(MATERIALS_LIST)
    return result
Mean Squared Error: 0.7187098083419174
Unfortunately, this feature did not improve on the existing result (about 0.711). But that is no great trouble: we can hope it will prove useful when setting up more complex models, so our work will not go to waste. The next columns of the tube.csv file are the tube diameter and similar measurements. Properties of this kind are turned into features in the simplest and most natural way, known as numeric (quantitative): the value is simply taken as is. Of course, in some cases it may be useful to normalize or otherwise transform it, but for a first attempt we can do without that. The code for this attribute turned out to be even simpler:
def get_diameter(instance, assembly_to_diameter):
    return [assembly_to_diameter[instance['tube_assembly_id']]]
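The loading of assembly_to_diameter is not shown in the article either; by analogy with the other dictionaries, it presumably looks like this inside load_additional_data() (the column index follows the tube.csv header above, and the diameter column is assumed to contain no 'NA' values):

    assembly_to_diameter = dict()
    with open('./data/competition_data/tube.csv') as input_stream:
        header_line = input_stream.readline()
        for line in input_stream:
            tube_assembly_id = line.split(',')[0]
            # diameter is the third column of tube.csv
            assembly_to_diameter[tube_assembly_id] = float(line.split(',')[2])
    result['assembly_to_diameter'] = assembly_to_diameter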
After adding it to the to_sample() function, the quality of the model's predictions on the validation sample improves further:
Mean Squared Error: 0.6968043166687439
As you can see, as we test easily extracted new features on the validation sample, the quality metric keeps improving. However, on the one hand, the gains from new features are no longer as large as before (and are sometimes negative, in which case the corresponding feature has to be discarded, as in the example with the materials column); on the other hand, this is a training project rather than competition work. So let's stop the feature selection at this point and move on to choosing the algorithm and optimizing its parameters.
Varying the model
Within the current feature set, we will try to find a near-optimal machine learning model together with a suitable set of hyperparameters. Let's start by testing different models. First, let's replace the simple linear regression we have used so far with a fashionable neural network from the Keras package:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

model = Sequential()
model.add(Dense(units=30, activation='tanh', input_dim=len(list_of_samples[0])))
model.add(Dense(units=1, activation='linear'))
optimizer = SGD(lr=0.1)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
This corresponds to a fully connected network architecture with 30 neurons in the hidden layer, a learning_rate of 0.1, and the tanh activation function. Note that since we are predicting a numeric parameter, the activation function of the output layer is linear rather than one of the sigmoid variants.
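The fitting and evaluation calls are omitted in the article; a minimal sketch of what they presumably looked like follows. The epoch count and batch size are assumptions, not values from the original.

from sklearn.metrics import mean_squared_error

model.fit(numpy.array(train_samples), numpy.array(train_labels),
          epochs=10, batch_size=32)  # hypothetical training settings
predictions = model.predict(numpy.array(validation_samples)).flatten()
print('Mean Squared Error: {0}'.format(
    mean_squared_error(validation_labels, predictions)))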
Mean Squared Error: nan
It turned out that the error is so large (VERY large!) that it does not fit into a numpy number. Let's try adding another layer:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

model = Sequential()
model.add(Dense(units=30, activation='tanh', input_dim=len(list_of_samples[0])))
model.add(Dense(units=20, activation='tanh'))
model.add(Dense(units=1, activation='linear'))
optimizer = SGD(lr=0.1)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
Mean Squared Error: nan
Again, nothing good came of it. Alas, it seems that neural networks, however convenient and attractive they are as models, are not suited to our task. However, if anyone knows how to get a good result here with a neural-network-type model, I will be waiting for a link to the code in the comments; in the meantime, let's switch to a more conservative approach. For example, one of the standard algorithms for solving regression problems is gradient boosting. Let's try it:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
Mean Squared Error: 0.44000911792278125
After the frustrating failure with the much-hyped neural networks, such substantial progress certainly lifts our spirits. Here, of course, one could also try other algorithms such as ridge regression, but since the author happens to know in advance that gradient boosting suits tasks of this kind well, that neural networks do really badly on them, and that the remaining models perform more or less comparably, let me spell out the plan: instead of going over all possible options and their optimal hyperparameters, we will simply optimize boosting.
Moreover, even before going to the corresponding page of the Scikit-Learn documentation, we can type help(GradientBoostingRegressor) in the console and find that this implementation of the algorithm has the following set of tuning parameters and default values:
loss='ls'
learning_rate=0.1
n_estimators=100
subsample=1.0
criterion='friedman_mse'
min_samples_split=2
min_samples_leaf=1
min_weight_fraction_leaf=0.0
max_depth=3
min_impurity_decrease=0.0
min_impurity_split=None
init=None
random_state=None
max_features=None
alpha=0.9
verbose=0
max_leaf_nodes=None
warm_start=False
presort='auto'
Let's go through them one by one and try changing those whose variation looks, at first glance, like it could help improve prediction quality.
 |  loss : {'ls', 'lad', 'huber', 'quantile'}, optional (default='ls')
 |      loss function to be optimized. 'ls' refers to least squares
 |      regression. 'lad' (least absolute deviation) is a highly robust
 |      loss function solely based on order information of the input
 |      variables. 'huber' is a combination of the two. 'quantile'
 |      allows quantile regression (use `alpha` to specify the quantile).
The loss function to be optimized. After our reformulation of the problem (in the sense that, having taken the logarithm of the labels, we optimize plain MSE instead of the logarithmic error), the default parameter seems to match the task. Let's leave it as is.
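To spell the reformulation out: since to_interim_label(y) = log(y + 1), the MSE computed on the transformed labels is

    (1/n) * sum_i (log(p_i + 1) - log(y_i + 1))^2,

i.e. exactly the squared logarithmic error on the original labels, so minimizing the default least-squares loss on the interim labels also minimizes the logarithmic metric.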
 |  learning_rate : float, optional (default=0.1)
 |      learning rate shrinks the contribution of each tree by `learning_rate`.
 |      There is a trade-off between learning_rate and n_estimators.
An important parameter to tune in many tasks. Let's see what happens at other values.
MODEL_SETTINGS = {
    'model_name': 'GradientBoostingRegressor',
    'learning_rate': 0.100}
...
model = GradientBoostingRegressor(learning_rate=MODEL_SETTINGS['learning_rate'])
The hyperparameters are declared in this simple, explicit, slightly elaborate form so that in the future it will be convenient to change the model parameters (and the model itself) right in the script header.
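For instance, the model can then be constructed generically from the header. MODEL_CLASSES and make_model() below are hypothetical helpers illustrating the idea, not part of the original script; note that keys that are not constructor arguments (such as the random_seed used later) would have to be filtered out as well.

MODEL_CLASSES = {
    'LinearRegression': LinearRegression,
    'GradientBoostingRegressor': GradientBoostingRegressor,
}

def make_model(settings):
    # Everything except the model name is passed to the constructor.
    params = {key: value for key, value in settings.items()
              if key != 'model_name'}
    return MODEL_CLASSES[settings['model_name']](**params)

model = make_model(MODEL_SETTINGS)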
Mean Squared Error: 0.44002379237806705
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.200}
Mean Squared Error: 0.41423518862618164
But increasing learning_rate noticeably improved the quality metric. Let's keep changing this parameter in the same direction.
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.300}
Mean Squared Error: 0.4051555356961356
Let's increase it further.
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.500}
Mean Squared Error: 0.39668129369369115
And further still:
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':1.000}
Mean Squared Error: 0.434184026080522
A further increase of the parameter made the target metric worse. Let's look for something among the intermediate values.
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.700}
Mean Squared Error: 0.39998809063954305
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.600}
Mean Squared Error: 0.4032676539076024
The data obtained in these experiments is already begging to be put into a table and drawn on a suitable graph (personally, I usually do not draw such graphs, limiting myself to mental constructions, but for educational purposes it is worth doing):
learning_rate | MSE |
---|---|
0.100 | 0.4400 |
0.200 | 0.4142 |
0.300 | 0.4051 |
0.500 | 0.3966 |
0.600 | 0.4032 |
0.700 | 0.3999 |
1.000 | 0.4341 |
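For those who do want to look at the picture, a quick plot of the sweep above (matplotlib is assumed to be available; the values are taken from the table):

import matplotlib.pyplot as plt

learning_rates = [0.100, 0.200, 0.300, 0.500, 0.600, 0.700, 1.000]
mse_values = [0.4400, 0.4142, 0.4051, 0.3966, 0.4032, 0.3999, 0.4341]

plt.plot(learning_rates, mse_values, marker='o')
plt.xlabel('learning_rate')
plt.ylabel('MSE')
plt.show()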

At first glance, it looks as if increasing learning_rate from 0.100 to 0.500 improves the quality metric, which then remains comparatively stable up to roughly 0.700 and degrades afterwards. Let's test this hypothesis with a few more experiments.
MODEL_SETTINGS = { 'model_name':'GradientBoostingRegressor', 'learning_rate':0.400}
Mean Squared Error: 0.40129637223972486
MODEL_SETTINGS = { 'model_name': 'GradientBoostingRegressor', 'learning_rate': 0.800}
Mean Squared Error: 0.4253214442400451
MODEL_SETTINGS = { 'model_name': 'GradientBoostingRegressor', 'learning_rate': 0.550}
Mean Squared Error: 0.39587242367884334
MODEL_SETTINGS = { 'model_name': 'GradientBoostingRegressor', 'learning_rate': 0.650}
Mean Squared Error: 0.40041838873950636
learning_rate | MSE |
---|---|
0.100 | 0.4400 |
0.200 | 0.4142 |
0.300 | 0.4051 |
0.400 | 0.4012 |
0.500 | 0.3966 |
0.550 | 0.3958 |
0.600 | 0.4032 |
0.650 | 0.4004 |
0.700 | 0.3999 |
0.800 | 0.4253 |
1.000 | 0.4341 |

The optimal value apparently lies in the range from 0.500 to 0.550, and changes within this neighborhood, on the whole, barely affect the final metric compared with other possible changes to the model parameters or the feature list. Let's fix learning_rate at 0.550 and turn our attention to the other model parameters.
By the way, in situations like this, to make sure that one or another set of hyperparameters really is the better choice, it can be useful to vary the random_state parameter available in many algorithms, or to re-split the data into training and validation parts while keeping the sizes of both. This gives additional information about the real effectiveness of the algorithm when there is no clear pattern connecting the hyperparameter settings with prediction quality on the validation sample.
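A sketch of that re-splitting idea; the shuffling helper below is an illustration of how it might be done, not code from the article:

import random

def resplit(list_of_samples, list_of_interim_labels, seed):
    # Shuffle deterministically with the given seed, then cut at the same
    # TRAIN_SAMPLES_NUM boundary as before, so only the assignment of
    # objects to the two parts changes, not the sizes of the parts.
    indices = list(range(len(list_of_samples)))
    random.Random(seed).shuffle(indices)
    train_indices = indices[:TRAIN_SAMPLES_NUM]
    validation_indices = indices[TRAIN_SAMPLES_NUM:]
    return ([list_of_samples[i] for i in train_indices],
            [list_of_interim_labels[i] for i in train_indices],
            [list_of_samples[i] for i in validation_indices],
            [list_of_interim_labels[i] for i in validation_indices])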
 |  n_estimators : int (default=100)
 |      The number of boosting stages to perform. Gradient boosting
 |      is fairly robust to over-fitting so a large number usually
 |      results in better performance.
The number of "boosting stages" performed when training a gradient boosting model. Honestly, the author has already forgotten exactly what role they play in the formal definition of the algorithm; rather than digging into the theory, let's simply test this parameter. By the way, let's also start measuring exactly how much time is spent training the model.
import time

MODEL_SETTINGS = {
    'model_name': 'GradientBoostingRegressor',
    'learning_rate': 0.550,
    'n_estimators': 100}

model = GradientBoostingRegressor(
    learning_rate=MODEL_SETTINGS['learning_rate'],
    n_estimators=MODEL_SETTINGS['n_estimators'])
time_start = time.time()
model.fit(numpy.array(train_samples), numpy.array(train_labels))
print('Time spent: {0}'.format(time.time() - time_start))
print('MODEL_SETTINGS = {{\n {0}\n {1}\n {2}}}'
      .format(MODEL_SETTINGS['model_name'],
              MODEL_SETTINGS['learning_rate'],
              MODEL_SETTINGS['n_estimators']))
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100}
Time spent: 71.83099746704102
Mean Squared Error: 0.39622103688045596
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 200}
Time spent: 141.9290111064911
Mean Squared Error: 0.40527237378150016
As you can see, the time spent on training grew substantially, while the quality metric only got worse. Just in case, let's increase the parameter once more.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 300}
Time spent: 204.2548701763153
Mean Squared Error: 0.4027642069054909
It is hard to say why, but this clearly did not work out. Let's roll everything back and look for other points of growth.

 |  max_depth : integer, optional (default=3)
 |      maximum depth of the individual regression estimators. The maximum
 |      depth limits the number of nodes in the tree. Tune this parameter
 |      for best performance; the best value depends on the interaction
 |      of the input variables.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3}
Time spent: 66.88031792640686
Mean Squared Error: 0.39713231957974565
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 4}
Time spent: 86.24338245391846
Mean Squared Error: 0.40575622943301354
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 2}
Time spent: 45.39022421836853
Mean Squared Error: 0.41356622455188463
Unfortunately, this does not help either; let's return to the original value and move on.

 |  criterion : string, optional (default="friedman_mse")
 |      The function to measure the quality of a split. Supported criteria
 |      are "friedman_mse" for the mean squared error with improvement
 |      score by Friedman, "mse" for mean squared error, and "mae" for
 |      the mean absolute error. The default value of "friedman_mse" is
 |      generally the best as it can provide a better approximation in
 |      some cases.

I cannot recall a case where changing this parameter helped improve a model, and Vorontsov said in his lectures that the splitting criterion can in practice be varied without any effect on the quality of the final model, so let's skip it and move on.

 |  min_samples_split : int, float, optional (default=2)
 |      The minimum number of samples required to split an internal node:
 |
 |      - If int, then consider `min_samples_split` as the minimum number.
 |      - If float, then `min_samples_split` is a percentage and
 |        `ceil(min_samples_split * n_samples)` are the minimum
 |        number of samples for each split.
 |
 |      .. versionchanged:: 0.18
 |         Added float values for percentages.

Let's try changing the minimum number of samples required to keep building the tree at a given vertex during training.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 2}

Time spent: 66.22262406349182
Mean Squared Error: 0.39721489877049687

With the default value the result, as expected, practically repeats the previous one. Let's increase the parameter.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 3}
Time spent: 66.18473935127258
Mean Squared Error: 0.39493122173406714
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 8}
Time spent: 66.7643404006958
Mean Squared Error: 0.3982469042761572
Progress ended before it properly began. To be safe, let's try the intermediate value 4.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4}
Time spent: 66.75952744483948
Mean Squared Error: 0.3945186290058591
Slightly better than for 3, and since the training time is no different, let's keep this value.

 |  min_samples_leaf : int, float, optional (default=1)
 |      The minimum number of samples required to be at a leaf node:
 |
 |      - If int, then consider `min_samples_leaf` as the minimum number.
 |      - If float, then `min_samples_leaf` is a percentage and
 |        `ceil(min_samples_leaf * n_samples)` are the minimum
 |        number of samples for each node.
 |
 |      .. versionchanged:: 0.18
 |         Added float values for percentages.

The minimum number of samples that can end up in a tree leaf after training. Increasing this parameter worsens prediction quality on the training sample (the smaller the value, the better the trees that make up the ensemble can fit individual examples of the training sample) but, with luck, improves quality on the validation sample; that is, at least in theory, it helps fight overfitting.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 1}
Time spent: 68.58824563026428
Mean Squared Error: 0.39465027476703846
Substituting the default parameter yields a prediction almost identical to the value from the previous experiment. That means everything is working as it should: our simple validation procedure is not confused by the starting values of the parameters and returns a value that genuinely corresponds to the algorithm's prediction quality.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 2}
Time spent: 68.03447198867798
Mean Squared Error: 0.39707533548242
Increasing the parameter from 1 to 2 led to a drop in the algorithm's quality metric. Nevertheless, let's increase it once more.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 3,
    'random_seed': 0}
Time spent: 66.98832631111145
Mean Squared Error: 0.39419555554861274
After the metric got worse, two consecutive increases of the parameter brought an improvement, slightly exceeding the original value. Along the way we received strong evidence that the current validation procedure gives an adequate estimate of prediction quality to at least four significant digits. Let's also check what happens when the random_seed parameter is changed.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 3,
    'random_seed': 1}
Time spent: 67.16857171058655
Mean Squared Error: 0.39483997966302
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 3,
    'random_seed': 2}
Time spent: 66.11015605926514
Mean Squared Error: 0.39492203941997045
At least up to the fourth decimal place noted above, the validation procedure really does seem to give an adequate assessment of the model. So the next conscience-clearing check will help us make sure that quality, surprisingly, depends non-monotonically on the value of this hyperparameter, and that it is worth increasing it once again.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 4,
    'random_seed': 0}
Time spent: 66.96864414215088
Mean Squared Error: 0.39725274882841366
Worse again. Perhaps, for some magical reason, predictions for even values of the parameter are worse than for the neighboring odd ones? Who knows...

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 5,
    'random_seed': 0}
Time spent: 66.33412432670593
Mean Squared Error: 0.39348528600652666
MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 5,
    'random_seed': 1}
Time spent: 66.22624254226685
Mean Squared Error: 0.3935675331843957
Quality improved a little, and the "hypothesis of worse prediction quality at even values" received yet another confirmation. Surely things are not actually like that, though...

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 6,
    'random_seed': 0}
Time spent: 66.88769054412842
Mean Squared Error: 0.38855940004423717
Sure enough: on switching from 5 to 6, quality again improved slightly. To save space, let's skip the full log of these experiments and move the results straight into a table:

min_samples_leaf | MSE | time spent (s) |
---|---|---|
1 | 0.3946 | 68.58 |
2 | 0.3970 | 68.03 |
3 | 0.3941 | 66.98 |
4 | 0.3972 | 66.96 |
5 | 0.3934 | 66.33 |
6 | 0.3885 | 66.88 |
7 | 0.3895 | 65.22 |
8 | 0.3903 | 65.89 |
9 | 0.3926 | 66.31 |
Given our high confidence in the validation results, we can assume that the optimal value of min_samples_leaf is 6. Let's move on to experiments with the next parameter.

 |  min_weight_fraction_leaf : float, optional (default=0.)
 |      The minimum weighted fraction of the sum total of weights (of all
 |      the input samples) required to be at a leaf node. Samples have
 |      equal weight when sample_weight is not provided.

The minimum fraction of examples required to form a tree leaf, given equal weights. By default this number is zero, i.e. no constraint is imposed. In theory, increasing the value of this parameter guards against overfitting, similarly to the min_samples_leaf parameter.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 6,
    'random_seed': 0,
    'min_weight_fraction_leaf': 0.01}
Time spent: 68.06336092948914
Mean Squared Error: 0.41160143391833687
Unfortunately, prediction quality deteriorated. Perhaps we took too large a value?

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 6,
    'random_seed': 0,
    'min_weight_fraction_leaf': 0.001}
Time spent: 67.03254532814026
Mean Squared Error: 0.39262469473669265
Again, the decrease did not help us. And in general, judging by the description of this parameter, for a sample of fixed size whichever of the two constraints turns out stricter is the one that takes effect: the one set by min_samples_leaf or the one set by min_weight_fraction_leaf. Let's leave it as is and move on.

 |  subsample : float, optional (default=1.0)
 |      The fraction of samples to be used for fitting the individual base
 |      learners. If smaller than 1.0 this results in Stochastic Gradient
 |      Boosting. `subsample` interacts with the parameter `n_estimators`.
 |      Choosing `subsample < 1.0` leads to a reduction of variance
 |      and an increase in bias.

MODEL_SETTINGS = {
    'model_name': GradientBoostingRegressor,
    'learning_rate': 0.55,
    'n_estimators': 100,
    'max_depth': 3,
    'min_samples_split': 4,
    'min_samples_leaf': 6,
    'random_seed': 0,
    'min_weight_fraction_leaf': 0.0,
    'subsample': 0.9}
Time spent: 155.24894833564758
Mean Squared Error: 0.39231319253775626
It seems that simply varying the parameters one at a time will no longer help us much (and, presumably, will hardly help at all). To finish this stage, let's generate a kaggle submission using our final request-processing class. Of course, when solving a problem on kaggle, people usually do this more simply, but in order to simulate the situation that arises in a real application, we can spend a little extra time writing some additional code to test our semi-finished product of a class.

Generating the prediction file

Under the spoiler are the changes to the generate_response.py script.

The corrected script [generate_response.py]
import pickle
import numpy
import research


class FinalModel(object):

    def __init__(self, model, to_sample, additional_data):
        self._model = model
        self._to_sample = to_sample
        self._additional_data = additional_data

    def process(self, instance):
        return self._model.predict(numpy.array(self._to_sample(
            instance, self._additional_data)).reshape(1, -1))[0]


if __name__ == '__main__':
    with open('./data/model.mdl', 'rb') as input_stream:
        model = pickle.loads(input_stream.read())
    additional_data = research.load_additional_data()
    final_model = FinalModel(model, research.to_sample, additional_data)
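The part that actually reads the test set and writes the submission is not shown above. A sketch of what it might look like, continuing inside the __main__ block: the column layout of test_set.csv (an id column followed by the fields consumed by to_sample()) and the output header are assumptions, and to_final_label() undoes the logarithmic label transform used during training.

    # Hypothetical continuation: generate output.csv from test_set.csv.
    with open('./data/competition_data/test_set.csv') as input_stream, \
            open('./data/output.csv', 'w') as output_stream:
        columns = input_stream.readline().strip().split(',')
        output_stream.write('id,cost\n')
        for line in input_stream:
            instance = dict(zip(columns, line.strip().split(',')))
            prediction = research.to_final_label(final_model.process(instance))
            output_stream.write('{0},{1}\n'.format(instance['id'], prediction))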
As input the script takes the model generated by research.py and the test_set.csv file with test data from the archive provided by the competition organizers; as output it generates an output.csv file. The fact that no changes related to forming the predictions were needed in generate_response.py demonstrates the flexibility of the training and serving pipeline: we added a lot of new features, changed the model, and used new data files that had not been used before, yet the part of the code related to applying the model remained essentially unchanged.
The kaggle competition has long since ended, but we can still check there how well the platform's assessment agrees with what validation led us to expect.
At first glance the result is encouraging: prediction quality turned out to be even higher than on validation. However, on the one hand, the public leaderboard result is noticeably higher than the validation result and, on the other hand, our position in the overall ranking (relative to the moment the competition ended) is still not very high. Let's leave the unraveling of both of these riddles for another occasion.

In conclusion
In this article we optimized the prediction algorithm: we tried different models, discovered that the much-advertised neural networks are not well suited to our task, experimented with hyperparameter selection, and finally generated our first prediction file for kaggle. Not bad for one article! See you next time!