```python
import pandas as pd
import numpy as np
import json
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from sklearn. metrics import roc_auc_score
from sklearn. model_selection import StratifiedKFold
class Config:
    """Run-wide settings read by the seeding, cross-validation and model code."""

    # Random seed handed to seed_everything, StratifiedKFold and LightGBM.
    seed = 2024
    # Number of stratified cross-validation folds.
    num_folds = 10
    # Name of the column in the training frame that stores the 0/1 label.
    TARGET_NAME = 'label'
import random
def seed_everything(seed):
    """Seed the global NumPy and stdlib ``random`` generators with ``seed``.

    Both generators are module-level singletons, so the call affects every
    later use of ``np.random`` and ``random`` in the process.
    """
    for reseed in (np.random.seed, random.seed):
        reseed(seed)
seed_everything(Config.seed)
path = '/kaggle/input/'


def _load_json(file_path):
    """Parse the JSON file at ``file_path`` and return the decoded object.

    The file is opened explicitly as UTF-8 so decoding does not depend on the
    platform's locale default (the result file is also written as UTF-8).
    """
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)


# Training authors, per-paper metadata, validation authors and the
# submission template, all read from the same dataset directory.
train_author = _load_json(path + "whoiswho-ind-kdd-2024/IND-WhoIsWho/train_author.json")
pid_to_info = _load_json(path + "whoiswho-ind-kdd-2024/IND-WhoIsWho/pid_to_info_all.json")
valid_author = _load_json(path + "whoiswho-ind-kdd-2024/IND-WhoIsWho/ind_valid_author.json")
submission = _load_json(path + "whoiswho-ind-kdd-2024/IND-WhoIsWho/ind_valid_author_submit.json")

# Accumulators filled by the training feature loop that follows.
train_feats = []
labels = []
def _paper_features(feat, default_year=2000):
    """Return the six numeric features for one paper record.

    Layout: ``[len(title), len(abstract), len(keywords), len(authors),
    len(keywords), year]``.  When ``feat['year']`` is missing or cannot be
    converted to ``int``, ``default_year`` is used instead.

    NOTE(review): ``len(keywords)`` fills two slots (index 2 and 4), exactly
    as in the original code.  Possibly a different field was intended for
    slot 4 -- confirm before changing, since it alters the model inputs.
    """
    try:
        year = int(feat['year'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Only the year parse is allowed to fall back; a broken text field
        # still raises, as it did before.
        year = default_year
    return [
        len(feat['title']),
        len(feat['abstract']),
        len(feat['keywords']),
        len(feat['authors']),
        len(feat['keywords']),
        year,
    ]


# Per author: papers listed under 'normal_data' get label 1, papers listed
# under 'outliers' get label 0.  Row order (normal first, then outliers,
# author by author) is kept because the seeded CV split depends on it.
for person_info in train_author.values():
    for paper_label, group in ((1, 'normal_data'), (0, 'outliers')):
        for text_id in person_info[group]:
            train_feats.append(_paper_features(pid_to_info[text_id]))
            labels.append(paper_label)

train_feats = np.array(train_feats)
labels = np.array(labels)
print(f"train_feats.shape: {train_feats.shape} ,labels.shape: {labels.shape} ")
print(f"np.mean(labels): {np.mean(labels)} ")
train_feats = pd.DataFrame(train_feats)
train_feats[Config.TARGET_NAME] = labels
train_feats.head()  # notebook-style preview; the result is discarded in a script

# Same feature layout for every paper of every validation author, in file order.
valid_feats = []
for person_info in valid_author.values():
    valid_feats.extend(
        _paper_features(pid_to_info[text_id]) for text_id in person_info['papers']
    )
valid_feats = np.array(valid_feats)
print(f"valid_feats.shape: {valid_feats.shape} ")
valid_feats = pd.DataFrame(valid_feats)
valid_feats.head()  # notebook-style preview; the result is discarded in a script

# Model inputs are all feature columns (the validation frame has no label column).
choose_cols = list(valid_feats.columns)
def fit_and_predict(model, train_feats=train_feats, test_feats=valid_feats, name=0):
    """Cross-validate ``model`` and collect out-of-fold and test probabilities.

    Args:
        model: Estimator exposing ``fit(X, y, eval_set=..., callbacks=...)``
            and a ``predict_proba`` whose output has two columns.  The same
            instance is refitted on every fold.
        train_feats: Frame holding the ``choose_cols`` feature columns plus
            the ``Config.TARGET_NAME`` column.  The default is the
            module-level frame as it exists when this function is defined.
        test_feats: Frame holding the ``choose_cols`` feature columns.
        name: Tag used only in the per-fold progress message.

    Returns:
        ``(oof_pred_pro, test_pred_pro)`` where ``oof_pred_pro`` has shape
        ``(len(train_feats), 2)`` and ``test_pred_pro`` has shape
        ``(Config.num_folds, len(test_feats), 2)`` (one slice per fold).
    """
    features = train_feats[choose_cols].copy()
    target = train_feats[Config.TARGET_NAME].copy()
    holdout = test_feats[choose_cols].copy()

    n_folds = Config.num_folds
    oof_pred_pro = np.zeros((len(features), 2))
    test_pred_pro = np.zeros((n_folds, len(holdout), 2))

    splitter = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=Config.seed,
    )
    fold = 0
    for fit_idx, eval_idx in splitter.split(features, target.astype(str)):
        print(f"name: {name} ,fold: {fold} ")
        X_fit = features.iloc[fit_idx]
        y_fit = target.iloc[fit_idx]
        eval_pair = (features.iloc[eval_idx], target.iloc[eval_idx])

        # The held-out fold doubles as the early-stopping evaluation set.
        model.fit(
            X_fit,
            y_fit,
            eval_set=[eval_pair],
            callbacks=[log_evaluation(100), early_stopping(100)],
        )

        oof_pred_pro[eval_idx] = model.predict_proba(eval_pair[0])
        test_pred_pro[fold] = model.predict_proba(holdout)
        fold += 1

    # Scored once, after every row has received its out-of-fold prediction.
    print(f"roc_auc: {roc_auc_score(target.values, oof_pred_pro[:, 1])} ")
    return oof_pred_pro, test_pred_pro
# LightGBM hyper-parameters.  The original literal listed "verbose" twice
# with the same value; the duplicate key is dropped.
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 12,
    "learning_rate": 0.05,
    "n_estimators": 3072,
    "colsample_bytree": 0.9,
    "colsample_bynode": 0.9,
    "verbose": -1,
    "random_state": Config.seed,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    'num_leaves': 64,
    "max_bin": 255,
}

lgb_oof_pred_pro, lgb_test_pred_pro = fit_and_predict(
    model=LGBMClassifier(**lgb_params),
    name='lgb',
)

# Average the per-fold test predictions and keep the second probability column.
test_preds = lgb_test_pred_pro.mean(axis=0)[:, 1]

# Fill the submission template positionally.
# NOTE(review): this assumes the template enumerates authors and papers in
# the same order the validation features were built -- confirm against the
# dataset; a mismatch would silently misassign scores.
cnt = 0
for author_id, paper_scores in submission.items():
    for paper_id in paper_scores:
        # Plain float keeps json.dump independent of the NumPy scalar dtype.
        paper_scores[paper_id] = float(test_preds[cnt])
        cnt += 1

with open('baseline.json', 'w', encoding='utf-8') as f:
    json.dump(submission, f, ensure_ascii=False, indent=4)