Spaces:

deedrop1140
/

MachineLearningAlgorithms

Running

File size: 46,771 Bytes

0d00d62

{
  "title": "LightGBM Mastery: Part 1 (Q1–25)",
  "description": "Questions 1–25 covering LightGBM fundamentals — boosting basics, leaf-wise growth, histogram optimization, and key parameters.",
  "questions": [
    {
      "id": 1,
      "questionText": "What type of algorithm is LightGBM based on?",
      "options": [
        "Bagging",
        "Boosting",
        "Stacking",
        "Voting"
      ],
      "correctAnswerIndex": 1,
      "explanation": "LightGBM is a boosting algorithm, specifically a gradient boosting framework that builds models sequentially."
    },
    {
      "id": 2,
      "questionText": "Which of the following is a unique characteristic of LightGBM?",
      "options": [
        "It grows trees level-wise",
        "It grows trees leaf-wise",
        "It uses deep neural networks",
        "It averages models"
      ],
      "correctAnswerIndex": 1,
      "explanation": "LightGBM grows trees leaf-wise (best-first) to reduce loss more efficiently compared to level-wise methods."
    },
    {
      "id": 3,
      "questionText": "What type of trees does LightGBM primarily use?",
      "options": [
        "Shallow random trees",
        "Deep neural trees",
        "Decision trees",
        "Regression trees"
      ],
      "correctAnswerIndex": 3,
      "explanation": "LightGBM primarily uses regression trees for both classification and regression tasks."
    },
    {
      "id": 4,
      "questionText": "Which technique allows LightGBM to handle large datasets efficiently?",
      "options": [
        "Feature hashing",
        "Histogram-based algorithm",
        "Random sampling",
        "PCA compression"
      ],
      "correctAnswerIndex": 1,
      "explanation": "LightGBM uses a histogram-based algorithm to reduce computation by discretizing continuous features into bins."
    },
    {
      "id": 5,
      "questionText": "Scenario: You have a very large dataset with millions of rows. Why is LightGBM often preferred over XGBoost?",
      "options": [
        "Because it uses neural networks internally",
        "Because it uses histogram-based splits and leaf-wise growth for efficiency",
        "Because it reduces model interpretability",
        "Because it doesn’t require gradient computation"
      ],
      "correctAnswerIndex": 1,
      "explanation": "LightGBM is optimized for large datasets using histogram-based splits and leaf-wise tree growth."
    },
    {
      "id": 6,
      "questionText": "Which LightGBM parameter controls the number of leaves in a single tree?",
      "options": [
        "num_leaves",
        "max_depth",
        "min_child_samples",
        "n_estimators"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The num_leaves parameter sets the maximum number of leaves in one tree, controlling model complexity."
    },
    {
      "id": 7,
      "questionText": "What happens if num_leaves is set too high?",
      "options": [
        "Model becomes underfit",
        "Model becomes overfit",
        "Model trains faster",
        "Model ignores small features"
      ],
      "correctAnswerIndex": 1,
      "explanation": "Too many leaves can make the model overly complex and prone to overfitting."
    },
    {
      "id": 8,
      "questionText": "Which LightGBM parameter controls the learning rate?",
      "options": [
        "decay_rate",
        "alpha",
        "learning_rate",
        "lambda"
      ],
      "correctAnswerIndex": 2,
      "explanation": "The learning_rate parameter determines how much each new tree contributes to the model."
    },
    {
      "id": 9,
      "questionText": "Scenario: Model accuracy stagnates early during training. Which parameter can you increase?",
      "options": [
        "num_leaves",
        "learning_rate",
        "n_estimators",
        "feature_fraction"
      ],
      "correctAnswerIndex": 2,
      "explanation": "Increasing n_estimators (number of boosting iterations) allows the model to learn longer."
    },
    {
      "id": 10,
      "questionText": "What is feature_fraction used for in LightGBM?",
      "options": [
        "Regularization to reduce overfitting by randomly selecting a fraction of features per tree",
        "Adjusting leaf size",
        "Reducing data size by sampling rows",
        "Controlling the number of leaves"
      ],
      "correctAnswerIndex": 0,
      "explanation": "feature_fraction randomly selects a fraction of features to train each tree, helping regularize the model."
    },
    {
      "id": 11,
      "questionText": "Which LightGBM parameter limits tree depth?",
      "options": [
        "max_depth",
        "num_leaves",
        "min_split_gain",
        "subsample"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_depth directly limits how deep trees can grow, preventing overfitting."
    },
    {
      "id": 12,
      "questionText": "What does min_child_samples control?",
      "options": [
        "Minimum number of samples required in a leaf",
        "Minimum number of features used in a tree",
        "Minimum iterations before early stopping",
        "Minimum value for learning rate"
      ],
      "correctAnswerIndex": 0,
      "explanation": "min_child_samples ensures that leaves have enough data points, acting as a regularization technique."
    },
    {
      "id": 13,
      "questionText": "Scenario: LightGBM model is overfitting. Which change helps reduce it?",
      "options": [
        "Decrease num_leaves",
        "Increase num_leaves",
        "Increase learning rate",
        "Remove regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing num_leaves decreases model complexity and helps combat overfitting."
    },
    {
      "id": 14,
      "questionText": "What does boosting_type='dart' mean in LightGBM?",
      "options": [
        "It uses Dropouts meet Multiple Additive Regression Trees",
        "It disables boosting",
        "It performs bagging only",
        "It builds random forests"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The DART variant of LightGBM randomly drops trees during boosting to improve generalization."
    },
    {
      "id": 15,
      "questionText": "Which LightGBM boosting type uses dropouts for regularization?",
      "options": [
        "gbdt",
        "dart",
        "goss",
        "rf"
      ],
      "correctAnswerIndex": 1,
      "explanation": "The DART boosting type introduces dropout in boosting to prevent overfitting."
    },
    {
      "id": 16,
      "questionText": "What does goss stand for in LightGBM?",
      "options": [
        "Gradient-based One-Side Sampling",
        "Gradient Optimization Sampling System",
        "Global Outlier Sampling Strategy",
        "Generalized Optimization Split Search"
      ],
      "correctAnswerIndex": 0,
      "explanation": "GOSS is Gradient-based One-Side Sampling — a LightGBM optimization that speeds up training by keeping all instances with large gradients and randomly sampling only from the instances with small gradients."
    },
    {
      "id": 17,
      "questionText": "Which LightGBM parameter helps in row subsampling?",
      "options": [
        "bagging_fraction",
        "feature_fraction",
        "lambda_l1",
        "min_split_gain"
      ],
      "correctAnswerIndex": 0,
      "explanation": "bagging_fraction controls the fraction of rows randomly sampled for training, providing row-wise subsampling; it only takes effect when bagging_freq is set to a non-zero value."
    },
    {
      "id": 18,
      "questionText": "Scenario: You want faster training but can tolerate a small loss in accuracy. Which parameter can you reduce?",
      "options": [
        "bagging_fraction",
        "num_leaves",
        "max_depth",
        "feature_fraction"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing bagging_fraction increases speed by using fewer data rows per iteration."
    },
    {
      "id": 19,
      "questionText": "What does lambda_l1 control in LightGBM?",
      "options": [
        "L1 regularization term on weights",
        "L2 regularization term on weights",
        "Dropout rate",
        "Learning rate decay"
      ],
      "correctAnswerIndex": 0,
      "explanation": "lambda_l1 adds L1 regularization on leaf weights to encourage sparsity and reduce overfitting."
    },
    {
      "id": 20,
      "questionText": "Which LightGBM regularization term penalizes large leaf weights using L2 norm?",
      "options": [
        "lambda_l1",
        "lambda_l2",
        "min_child_samples",
        "feature_fraction"
      ],
      "correctAnswerIndex": 1,
      "explanation": "lambda_l2 applies L2 regularization to prevent large leaf weights and stabilize training."
    },
    {
      "id": 21,
      "questionText": "Scenario: Model accuracy is fluctuating during boosting. Which parameter helps smooth this effect?",
      "options": [
        "learning_rate",
        "num_leaves",
        "min_gain_to_split",
        "bagging_freq"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A smaller learning_rate helps stabilize model updates, reducing fluctuations."
    },
    {
      "id": 22,
      "questionText": "What is the role of min_gain_to_split?",
      "options": [
        "Minimum loss reduction required for a split",
        "Minimum number of leaves required per tree",
        "Maximum number of features allowed",
        "Learning rate decay factor"
      ],
      "correctAnswerIndex": 0,
      "explanation": "min_gain_to_split prevents small, insignificant splits by requiring a minimum loss reduction."
    },
    {
      "id": 23,
      "questionText": "Scenario: Dataset contains categorical variables. How does LightGBM handle them efficiently?",
      "options": [
        "Using one-hot encoding automatically",
        "By internally converting them using optimal split algorithms",
        "By ignoring categorical variables",
        "By treating them as numeric values directly"
      ],
      "correctAnswerIndex": 1,
      "explanation": "LightGBM natively supports categorical features by finding optimal split points without full one-hot encoding."
    },
    {
      "id": 24,
      "questionText": "Which parameter in LightGBM is used to handle categorical features?",
      "options": [
        "categorical_feature",
        "cat_var",
        "cat_split",
        "categorical_index"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The categorical_feature parameter specifies which columns are treated as categorical during training."
    },
    {
      "id": 25,
      "questionText": "Scenario: LightGBM is using GPU for training. Which advantage does this offer?",
      "options": [
        "Faster histogram construction and split finding",
        "Automatic feature engineering",
        "Better interpretability",
        "Improved regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "GPU acceleration speeds up histogram creation and split computations, significantly reducing training time."
    }
  ]
}
{
  "title": "LightGBM Mastery: Part 2 (Q26–50)",
  "description": "Questions 26–50 exploring LightGBM tuning — sampling, regularization, parameter interactions, parallelization, and practical training strategies.",
  "questions": [
    {
      "id": 26,
      "questionText": "What does the bagging_freq parameter control in LightGBM?",
      "options": [
        "How frequently bagging is performed during training",
        "Number of features to drop per tree",
        "Learning rate schedule frequency",
        "Regularization update interval"
      ],
      "correctAnswerIndex": 0,
      "explanation": "bagging_freq sets how often row subsampling is redone: a value of k re-samples the rows every k boosting iterations, and 0 disables bagging."
    },
    {
      "id": 27,
      "questionText": "Which LightGBM optimization helps in handling large-scale datasets efficiently?",
      "options": [
        "Histogram-based binning",
        "Deep tree expansion",
        "Dynamic pruning",
        "Recurrent boosting"
      ],
      "correctAnswerIndex": 0,
      "explanation": "LightGBM uses histogram-based binning to reduce computation and memory usage for large datasets."
    },
    {
      "id": 28,
      "questionText": "Scenario: Dataset contains extreme class imbalance. Which parameter helps mitigate it?",
      "options": [
        "scale_pos_weight",
        "bagging_fraction",
        "num_leaves",
        "min_child_samples"
      ],
      "correctAnswerIndex": 0,
      "explanation": "scale_pos_weight adjusts the relative weight of positive samples to handle imbalanced datasets effectively."
    },
    {
      "id": 29,
      "questionText": "Which LightGBM setting should be increased to make the model less sensitive to noise?",
      "options": [
        "min_child_samples",
        "num_leaves",
        "learning_rate",
        "max_depth"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Increasing min_child_samples ensures leaves contain more data points, making the model more robust to noise."
    },
    {
      "id": 30,
      "questionText": "What is the role of early_stopping_round in LightGBM training?",
      "options": [
        "Stops training when validation loss does not improve after a certain number of rounds",
        "Reduces learning rate automatically",
        "Saves best iteration for retraining",
        "Increases number of leaves gradually"
      ],
      "correctAnswerIndex": 0,
      "explanation": "early_stopping_round halts training if performance on validation data stops improving."
    },
    {
      "id": 31,
      "questionText": "Scenario: You observe that LightGBM trains very fast but underfits. What adjustment helps?",
      "options": [
        "Increase num_leaves or n_estimators",
        "Decrease learning rate",
        "Reduce max_depth",
        "Reduce feature_fraction"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Increasing num_leaves or n_estimators allows the model to capture more complexity and reduce underfitting."
    },
    {
      "id": 32,
      "questionText": "What does LightGBM’s leaf-wise tree growth mean?",
      "options": [
        "It splits the leaf with the highest loss reduction first",
        "It splits all leaves at the same level simultaneously",
        "It grows the tree symmetrically",
        "It uses fixed depth trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Leaf-wise growth selects and splits the leaf that gives the greatest loss reduction, leading to faster convergence."
    },
    {
      "id": 33,
      "questionText": "Which parameter combination most affects model complexity?",
      "options": [
        "num_leaves and max_depth",
        "feature_fraction and bagging_fraction",
        "learning_rate and n_estimators",
        "lambda_l1 and lambda_l2"
      ],
      "correctAnswerIndex": 0,
      "explanation": "num_leaves and max_depth jointly control tree structure and hence the complexity of the model."
    },
    {
      "id": 34,
      "questionText": "Scenario: LightGBM runs out of memory on a massive dataset. Which setting helps reduce memory usage?",
      "options": [
        "Reduce max_bin",
        "Increase learning_rate",
        "Set boosting_type to dart",
        "Increase num_leaves"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing max_bin decreases the number of histogram bins, lowering memory requirements."
    },
    {
      "id": 35,
      "questionText": "What does the parameter max_bin represent in LightGBM?",
      "options": [
        "Maximum number of bins to bucket continuous features",
        "Maximum number of leaves per tree",
        "Maximum depth of trees",
        "Maximum iterations for convergence"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_bin determines how many discrete bins each feature will be divided into during histogram building."
    },
    {
      "id": 36,
      "questionText": "Scenario: Model training takes too long. Which adjustment improves speed most effectively?",
      "options": [
        "Reduce max_bin or use bagging_fraction < 1",
        "Increase num_leaves",
        "Reduce learning_rate only",
        "Increase regularization terms"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing max_bin lowers the cost of building histograms, and a bagging_fraction below 1 reduces the number of rows processed each iteration; both speed up training."
    },
    {
      "id": 37,
      "questionText": "What is the primary drawback of leaf-wise tree growth?",
      "options": [
        "Higher risk of overfitting on small data",
        "Slower convergence",
        "Worse performance on large datasets",
        "Poor categorical handling"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Leaf-wise growth can overfit on small datasets because it may produce very deep trees."
    },
    {
      "id": 38,
      "questionText": "Which LightGBM parameter defines how many bins are created for each feature?",
      "options": [
        "max_bin",
        "num_leaves",
        "feature_fraction",
        "max_depth"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_bin sets how finely continuous features are bucketed into discrete bins for histogram-based learning."
    },
    {
      "id": 39,
      "questionText": "What type of regularization do lambda_l1 and lambda_l2 correspond to?",
      "options": [
        "Lasso and Ridge regularization",
        "Elastic Net regularization",
        "Dropout regularization",
        "Tree pruning regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "lambda_l1 and lambda_l2 implement Lasso (L1) and Ridge (L2) regularization respectively."
    },
    {
      "id": 40,
      "questionText": "Scenario: You observe overfitting with high validation error. Which parameters help reduce it?",
      "options": [
        "Increase min_child_samples, decrease num_leaves",
        "Increase learning_rate, increase num_leaves",
        "Reduce lambda_l2",
        "Increase feature_fraction"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Increasing min_child_samples and reducing num_leaves simplify the model, reducing overfitting."
    },
    {
      "id": 41,
      "questionText": "What is the role of monotone_constraints in LightGBM?",
      "options": [
        "Ensure certain features have monotonic relationships with the target",
        "Enforce equal feature importance",
        "Reduce overfitting using L2 regularization",
        "Apply monotonic normalization to inputs"
      ],
      "correctAnswerIndex": 0,
      "explanation": "monotone_constraints force LightGBM to maintain a monotonic relationship for specific features."
    },
    {
      "id": 42,
      "questionText": "Scenario: You want reproducible results from LightGBM training. Which parameter helps?",
      "options": [
        "random_state",
        "seed",
        "random_seed",
        "Any of the above"
      ],
      "correctAnswerIndex": 3,
      "explanation": "seed is LightGBM's master seed parameter, and random_state and random_seed are aliases for it, so setting any of them fixes the random number generation used during training."
    },
    {
      "id": 43,
      "questionText": "Which LightGBM parameter defines the objective function?",
      "options": [
        "objective",
        "metric",
        "boosting_type",
        "learning_rate"
      ],
      "correctAnswerIndex": 0,
      "explanation": "The objective parameter defines the loss function that LightGBM optimizes, e.g., 'binary', 'regression'."
    },
    {
      "id": 44,
      "questionText": "Scenario: You are using LightGBM for multi-class classification with the standard softmax objective. What should the objective be set to?",
      "options": [
        "multiclass",
        "multiclassova",
        "binary",
        "regression"
      ],
      "correctAnswerIndex": 0,
      "explanation": "For multi-class classification, objective='multiclass' should be used with num_class specified."
    },
    {
      "id": 45,
      "questionText": "What does feature_pre_filter in LightGBM control?",
      "options": [
        "Whether features are pre-screened before training",
        "Feature normalization",
        "Automatic feature selection during training",
        "Dropout of low importance features"
      ],
      "correctAnswerIndex": 0,
      "explanation": "feature_pre_filter determines whether LightGBM pre-filters, when the Dataset is constructed, the features that cannot be split under the current min_data_in_leaf setting."
    },
    {
      "id": 46,
      "questionText": "Scenario: You use categorical features in LightGBM. What advantage does native support provide?",
      "options": [
        "Faster training and memory efficiency",
        "One-hot encoding automatically expands features",
        "Improved interpretability",
        "Model regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Native categorical handling avoids one-hot expansion, leading to faster and more efficient training."
    },
    {
      "id": 47,
      "questionText": "Which LightGBM boosting type is best for highly imbalanced datasets?",
      "options": [
        "goss",
        "gbdt",
        "dart",
        "rf"
      ],
      "correctAnswerIndex": 0,
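      "explanation": "GOSS (Gradient-based One-Side Sampling) keeps every large-gradient sample, which often includes hard-to-classify minority-class rows, and subsamples only the small-gradient ones; for severe imbalance it is usually combined with scale_pos_weight or is_unbalance."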
    },
    {
      "id": 48,
      "questionText": "Scenario: LightGBM shows slightly worse accuracy than XGBoost. Which parameter tuning may help?",
      "options": [
        "Reduce learning_rate and increase n_estimators",
        "Increase feature_fraction",
        "Reduce num_leaves",
        "Disable histogram optimization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Decreasing learning_rate while increasing n_estimators allows more refined learning and may improve accuracy."
    },
    {
      "id": 49,
      "questionText": "Which LightGBM metric should you use for binary classification?",
      "options": [
        "binary_logloss",
        "l2",
        "mae",
        "multi_logloss"
      ],
      "correctAnswerIndex": 0,
      "explanation": "binary_logloss measures the log loss for binary classification tasks."
    },
    {
      "id": 50,
      "questionText": "Scenario: You want to save training time without losing much performance. Which parameters can be combined?",
      "options": [
        "Use smaller max_bin, bagging_fraction, and feature_fraction",
        "Increase num_leaves and n_estimators",
        "Reduce learning_rate only",
        "Disable regularization terms"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing max_bin, bagging_fraction, and feature_fraction reduces training cost while maintaining accuracy."
    }
  ]
}
{
  "title": "LightGBM Mastery: Part 3 (Q51–75)",
  "description": "Intermediate to advanced questions (51–75) exploring LightGBM regularization, overfitting control, advanced parameters, and real-world optimization scenarios.",
  "questions": [
    {
      "id": 51,
      "questionText": "What is the function of lambda_l1 in LightGBM?",
      "options": [
        "Applies L1 regularization to leaf weights",
        "Limits number of leaves per tree",
        "Determines the learning rate",
        "Applies L2 regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "lambda_l1 controls L1 regularization on leaf weights to reduce overfitting."
    },
    {
      "id": 52,
      "questionText": "What is the function of lambda_l2 in LightGBM?",
      "options": [
        "Applies L2 regularization to leaf weights",
        "Controls learning rate",
        "Reduces feature fraction",
        "Prunes shallow trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "lambda_l2 applies L2 regularization on leaf weights to smooth large values and improve generalization."
    },
    {
      "id": 53,
      "questionText": "Scenario: You notice LightGBM overfits heavily. Which parameters could help?",
      "options": [
        "Decrease num_leaves, increase min_data_in_leaf",
        "Increase num_leaves, lower min_data_in_leaf",
        "Increase learning rate",
        "Increase max_depth only"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing num_leaves and increasing min_data_in_leaf makes trees simpler and prevents overfitting."
    },
    {
      "id": 54,
      "questionText": "What does 'min_split_gain' control in LightGBM?",
      "options": [
        "Minimum gain required to perform a split",
        "Maximum gain per leaf",
        "Learning rate adjustment",
        "Subsample fraction"
      ],
      "correctAnswerIndex": 0,
      "explanation": "min_split_gain sets a threshold for information gain; splits below this are ignored."
    },
    {
      "id": 55,
      "questionText": "Which LightGBM parameter controls row sampling per iteration?",
      "options": [
        "bagging_fraction",
        "feature_fraction",
        "num_leaves",
        "max_bin"
      ],
      "correctAnswerIndex": 0,
      "explanation": "bagging_fraction randomly samples rows for each boosting round to reduce variance."
    },
    {
      "id": 56,
      "questionText": "What does 'feature_fraction' control in LightGBM?",
      "options": [
        "Fraction of features used per tree",
        "Number of bins per feature",
        "Maximum depth of tree",
        "Feature scaling"
      ],
      "correctAnswerIndex": 0,
      "explanation": "feature_fraction specifies the proportion of features used to build each tree."
    },
    {
      "id": 57,
      "questionText": "Scenario: Increasing feature_fraction improves accuracy but lowers stability. Why?",
      "options": [
        "More features increase model variance",
        "Feature_fraction reduces bias",
        "Learning rate decreases automatically",
        "Tree depth is fixed"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Using more features per iteration reduces randomness and increases variance, which may reduce stability."
    },
    {
      "id": 58,
      "questionText": "Which LightGBM parameter controls data sampling frequency?",
      "options": [
        "bagging_freq",
        "num_iterations",
        "min_child_samples",
        "max_depth"
      ],
      "correctAnswerIndex": 0,
      "explanation": "bagging_freq controls how often (in iterations) row subsampling is performed."
    },
    {
      "id": 59,
      "questionText": "Scenario: You set bagging_fraction=1.0 and feature_fraction=1.0. Effect?",
      "options": [
        "No random sampling; all data and features used every iteration",
        "Strong regularization",
        "Improved generalization",
        "Subsampling increases variance"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Setting both to 1.0 disables random sampling, using all features and samples every iteration."
    },
    {
      "id": 60,
      "questionText": "What does 'max_bin' affect in LightGBM?",
      "options": [
        "Precision of feature discretization",
        "Learning rate",
        "Tree depth",
        "Bagging rate"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_bin determines how many bins each continuous feature is bucketed into for histogram-based splitting."
    },
    {
      "id": 61,
      "questionText": "Higher max_bin values typically lead to what?",
      "options": [
        "More precise splits but slower training",
        "Faster training with less precision",
        "More regularization",
        "Smaller trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Increasing max_bin gives more precise splits but increases memory and training time."
    },
    {
      "id": 62,
      "questionText": "Scenario: Large dataset with limited RAM. What should you adjust?",
      "options": [
        "Decrease max_bin and num_leaves",
        "Increase learning rate",
        "Disable histogram mode",
        "Increase max_depth"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing max_bin and num_leaves lowers memory footprint and speeds up training."
    },
    {
      "id": 63,
      "questionText": "What is the function of 'min_data_in_leaf'?",
      "options": [
        "Minimum number of samples required to form a leaf",
        "Maximum depth limit",
        "Learning rate controller",
        "Number of leaves in total"
      ],
      "correctAnswerIndex": 0,
      "explanation": "min_data_in_leaf ensures a minimum number of samples per leaf to avoid overfitting."
    },
    {
      "id": 64,
      "questionText": "What happens if min_data_in_leaf is set too high?",
      "options": [
        "Model underfits due to shallow trees",
        "Model overfits easily",
        "Learning rate decreases",
        "Training stops early"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Too high min_data_in_leaf makes leaves large and reduces model complexity, causing underfitting."
    },
    {
      "id": 65,
      "questionText": "What parameter limits the maximum tree depth in LightGBM?",
      "options": [
        "max_depth",
        "num_leaves",
        "min_data_in_leaf",
        "feature_fraction"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_depth caps how deep each tree can grow."
    },
    {
      "id": 66,
      "questionText": "Scenario: You set max_depth=-1. What happens?",
      "options": [
        "Tree depth is unlimited and controlled by num_leaves instead",
        "Training fails",
        "Trees become shallow automatically",
        "Regularization is disabled"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Setting max_depth=-1 removes explicit depth restriction; num_leaves indirectly limits complexity."
    },
    {
      "id": 67,
      "questionText": "What is the effect of increasing num_iterations in LightGBM?",
      "options": [
        "Model trains longer and may overfit if learning_rate is not reduced",
        "Model converges faster",
        "Less accurate model",
        "Shallower trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "More boosting iterations improve fit but can overfit unless compensated by lower learning rate."
    },
    {
      "id": 68,
      "questionText": "Scenario: Reducing learning_rate but keeping num_iterations constant causes?",
      "options": [
        "Underfitting, since model learns slower",
        "Overfitting",
        "Higher variance",
        "Deeper trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Low learning rate with few iterations may lead to underfitting as the model learns too slowly."
    },
    {
      "id": 69,
      "questionText": "What is the main benefit of histogram-based decision trees in LightGBM?",
      "options": [
        "Faster training and lower memory usage",
        "More precise split thresholds",
        "Supports only small datasets",
        "Improves interpretability"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Histogram-based methods speed up training by grouping continuous values into discrete bins."
    },
    {
      "id": 70,
      "questionText": "Scenario: You increase max_bin significantly. What might happen?",
      "options": [
        "Training slows down and may overfit",
        "Training speeds up",
        "Model ignores rare features",
        "Learning rate increases automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Higher max_bin allows finer splits but can increase overfitting and computation time."
    },
    {
      "id": 71,
      "questionText": "What parameter controls the number of boosting rounds?",
      "options": [
        "num_iterations",
        "max_depth",
        "feature_fraction",
        "min_data_in_leaf"
      ],
      "correctAnswerIndex": 0,
      "explanation": "num_iterations defines the total number of boosting rounds (trees) to train."
    },
    {
      "id": 72,
      "questionText": "Scenario: Decreasing num_iterations while keeping learning_rate fixed will usually?",
      "options": [
        "Reduce model capacity and may underfit",
        "Cause overfitting",
        "Speed up convergence with higher accuracy",
        "Have no effect"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Fewer iterations reduce model capacity, leading to underfitting if learning_rate is unchanged."
    },
    {
      "id": 73,
      "questionText": "What is the benefit of early_stopping_rounds in LightGBM?",
      "options": [
        "Automatically halts training when validation loss stops improving",
        "Reduces learning rate dynamically",
        "Increases tree depth automatically",
        "Samples more features"
      ],
      "correctAnswerIndex": 0,
      "explanation": "early_stopping_rounds prevents overfitting by stopping when performance stops improving on validation data."
    },
    {
      "id": 74,
      "questionText": "Which parameter combination best prevents overfitting?",
      "options": [
        "Lower num_leaves, lower learning_rate, higher min_data_in_leaf",
        "Higher num_leaves, higher learning_rate",
        "Increase max_depth only",
        "Set bagging_fraction=1"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Simpler trees, smaller learning rate, and more data per leaf enhance generalization."
    },
    {
      "id": 75,
      "questionText": "Scenario: Large data, strong overfitting, and high variance. What to do?",
      "options": [
        "Lower num_leaves, use bagging and feature_fraction < 1",
        "Increase tree depth",
        "Raise learning rate",
        "Disable regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Using smaller trees and random sampling helps reduce overfitting and variance."
    }
  ]
}
{
  "title": "LightGBM Mastery: Part 4 (Q76–100)",
  "description": "Advanced and expert-level questions (76–100) exploring LightGBM’s GPU acceleration, categorical feature encoding, distributed learning, interpretability, and fine-tuning strategies.",
  "questions": [
    {
      "id": 76,
      "questionText": "What is one key advantage of LightGBM over XGBoost?",
      "options": [
        "Uses leaf-wise tree growth for faster convergence",
        "Uses level-wise tree growth for stability",
        "Cannot handle large datasets",
        "Lacks regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "LightGBM grows trees leaf-wise (best-first), optionally limited by max_depth, which typically reaches a lower loss in fewer splits than level-wise growth."
    },
    {
      "id": 77,
      "questionText": "What happens if num_leaves is much larger than 2^max_depth?",
      "options": [
        "Overfitting increases due to overly complex trees",
        "Model underfits severely",
        "No effect on model performance",
        "Training halts automatically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "LightGBM's tuning guidance is to keep num_leaves below 2^max_depth; setting num_leaves too high lets leaf-wise trees become overly complex, which increases overfitting."
    },
    {
      "id": 78,
      "questionText": "Scenario: You enable GPU support in LightGBM. What primary benefit is expected?",
      "options": [
        "Faster histogram construction and split finding",
        "Improved accuracy",
        "Reduced model complexity",
        "Automatic regularization"
      ],
      "correctAnswerIndex": 0,
      "explanation": "GPU acceleration speeds up histogram building and split calculations, improving training speed."
    },
    {
      "id": 79,
      "questionText": "Which LightGBM parameter enables GPU training?",
      "options": [
        "device_type='gpu'",
        "gpu_enable=True",
        "use_gpu=1",
        "boosting_type='gpu'"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Setting device_type='gpu' tells LightGBM to use GPU resources for training."
    },
    {
      "id": 80,
      "questionText": "What is the impact of 'boosting_type' parameter?",
      "options": [
        "Selects the boosting algorithm (gbdt, dart, goss)",
        "Controls feature sampling",
        "Defines tree depth",
        "Applies learning rate decay"
      ],
      "correctAnswerIndex": 0,
      "explanation": "boosting_type specifies the boosting algorithm variant such as 'gbdt', 'dart', or 'goss'."
    },
    {
      "id": 81,
      "questionText": "What is DART in LightGBM?",
      "options": [
        "Dropouts meet Multiple Additive Regression Trees",
        "Distributed Automatic Regression Tree",
        "Dynamic Adaptive Regularized Trees",
        "Data Adaptive Reduction Technique"
      ],
      "correctAnswerIndex": 0,
      "explanation": "DART is 'Dropouts meet Multiple Additive Regression Trees', introducing dropout into boosting to reduce overfitting."
    },
    {
      "id": 82,
      "questionText": "Scenario: Using boosting_type='goss'. What does GOSS stand for?",
      "options": [
        "Gradient-based One-Side Sampling",
        "Global Overfitting Sample Selector",
        "Generalized Optimization for Split Search",
        "Gradient Optimization Sampling Strategy"
      ],
      "correctAnswerIndex": 0,
      "explanation": "GOSS stands for Gradient-based One-Side Sampling, reducing data processed per iteration for speed."
    },
    {
      "id": 83,
      "questionText": "What does GOSS primarily do?",
      "options": [
        "Keeps large-gradient samples and randomly drops small-gradient ones",
        "Drops large-gradient samples",
        "Uses all samples equally",
        "Increases number of trees"
      ],
      "correctAnswerIndex": 0,
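      "explanation": "GOSS keeps all large-gradient samples, randomly samples the small-gradient ones, and up-weights the sampled small-gradient instances to keep the gain estimate approximately unbiased, reducing computation while preserving accuracy."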
    },
    {
      "id": 84,
      "questionText": "Scenario: Dataset has many categorical variables. What should you do?",
      "options": [
        "Use LightGBM's built-in categorical feature support",
        "One-hot encode all features manually",
        "Convert to text data",
        "Ignore categorical columns"
      ],
      "correctAnswerIndex": 0,
      "explanation": "LightGBM natively supports categorical features through optimal split encoding without full one-hot expansion."
    },
    {
      "id": 85,
      "questionText": "How does LightGBM handle categorical features internally?",
      "options": [
        "Sorts categories by average target and finds best split",
        "Performs label encoding only",
        "Uses frequency encoding",
        "Applies hash bucketing"
      ],
      "correctAnswerIndex": 0,
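      "explanation": "LightGBM sorts a categorical feature's histogram by its accumulated gradient statistics (sum_gradient / sum_hessian), which act as a proxy for the average target per category, and then finds the best split on the sorted histogram."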
    },
    {
      "id": 86,
      "questionText": "Scenario: Training time is long on large data with many features. What can help?",
      "options": [
        "Reduce feature_fraction and bagging_fraction",
        "Increase num_leaves",
        "Disable histogram mode",
        "Increase max_bin drastically"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Reducing feature_fraction and bagging_fraction speeds up training by using subsets of features and samples."
    },
    {
      "id": 87,
      "questionText": "What is the role of 'max_cat_threshold'?",
      "options": [
        "Controls maximum thresholds for categorical splits",
        "Limits maximum tree depth",
        "Sets number of categories allowed",
        "Defines learning rate schedule"
      ],
      "correctAnswerIndex": 0,
      "explanation": "max_cat_threshold limits how many thresholds LightGBM evaluates for categorical splits."
    },
    {
      "id": 88,
      "questionText": "Scenario: Distributed LightGBM training is producing inconsistent results. Likely reason?",
      "options": [
        "Non-deterministic data shuffling or parameter differences across nodes",
        "Too high learning rate",
        "Disabled GPU support",
        "Overfitting due to small num_leaves"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Different random seeds or node configurations in distributed mode can cause inconsistency."
    },
    {
      "id": 89,
      "questionText": "What helps ensure reproducible LightGBM results?",
      "options": [
        "Set deterministic=True and fix random_seed",
        "Increase bagging_fraction",
        "Enable GPU mode",
        "Reduce learning rate"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Setting deterministic=True and fixing random_seed ensures consistent results across runs."
    },
    {
      "id": 90,
      "questionText": "Which LightGBM feature allows parallel learning across machines?",
      "options": [
        "Distributed training mode",
        "Bagging",
        "GPU histograms",
        "Early stopping"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Distributed mode enables training across multiple machines; LightGBM supports data-parallel, feature-parallel, and voting-parallel tree learners (tree_learner parameter)."
    },
    {
      "id": 91,
      "questionText": "What is the key difference between DART and standard GBDT?",
      "options": [
        "DART randomly drops trees during training to prevent overfitting",
        "DART doubles learning rate dynamically",
        "DART uses fewer features per tree",
        "DART cannot perform regression tasks"
      ],
      "correctAnswerIndex": 0,
      "explanation": "DART introduces dropout on trees, improving regularization and generalization."
    },
    {
      "id": 92,
      "questionText": "Scenario: Validation accuracy fluctuates heavily between iterations. Likely cause?",
      "options": [
        "Learning rate too high or bagging too aggressive",
        "Too many trees",
        "Too few bins",
        "High lambda_l2"
      ],
      "correctAnswerIndex": 0,
      "explanation": "High learning rate or aggressive subsampling can cause instability in validation metrics."
    },
    {
      "id": 93,
      "questionText": "What does 'linear_tree' parameter enable?",
      "options": [
        "Adds linear models to each leaf for hybrid boosting",
        "Switches boosting type",
        "Performs polynomial regression",
        "Forces shallow trees"
      ],
      "correctAnswerIndex": 0,
      "explanation": "linear_tree enables a linear model within each leaf, combining tree and linear learning."
    },
    {
      "id": 94,
      "questionText": "Scenario: Using linear_tree improved performance slightly but increased training time. Why?",
      "options": [
        "Linear models per leaf require additional optimization",
        "Learning rate reduced automatically",
        "Tree structure became shallower",
        "Fewer bins created per feature"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Each leaf fits a small linear model, increasing training computation but often improving accuracy."
    },
    {
      "id": 95,
      "questionText": "Which LightGBM setting improves memory efficiency on large data?",
      "options": [
        "Use histogram pool sharing and smaller max_bin",
        "Increase max_depth",
        "Enable linear_tree",
        "Disable bagging"
      ],
      "correctAnswerIndex": 0,
      "explanation": "A smaller max_bin shrinks every feature histogram, and bounding the histogram cache (histogram_pool_size) further lowers memory usage."
    },
    {
      "id": 96,
      "questionText": "Scenario: Feature importance shows unexpected zeros for numeric features. Why?",
      "options": [
        "Feature was rarely used due to high correlation or low information gain",
        "Model error",
        "Bug in LightGBM",
        "Feature_fraction=1.0"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Highly correlated or uninformative features may never be chosen for splits, yielding zero importance."
    },
    {
      "id": 97,
      "questionText": "What is the purpose of 'monotone_constraints'?",
      "options": [
        "Forces model predictions to follow specified monotonic relationships with features",
        "Restricts tree depth",
        "Balances data classes",
        "Disables early stopping"
      ],
      "correctAnswerIndex": 0,
      "explanation": "monotone_constraints ensure predictions move consistently up or down with certain features."
    },
    {
      "id": 98,
      "questionText": "Scenario: You set monotone_constraints incorrectly. Possible issue?",
      "options": [
        "Model accuracy drops or fails to converge",
        "Training halts immediately",
        "All features are ignored",
        "Learning rate resets"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Monotonic constraints that contradict the true relationship in the data block useful splits, so the model fits poorly and accuracy or convergence suffers."
    },
    {
      "id": 99,
      "questionText": "What metric would you monitor for binary classification?",
      "options": [
        "binary_logloss or AUC",
        "mean_squared_error",
        "poisson",
        "quantile"
      ],
      "correctAnswerIndex": 0,
      "explanation": "For binary tasks, LightGBM supports metrics like binary_logloss and AUC for evaluation."
    },
    {
      "id": 100,
      "questionText": "Scenario: After tuning, training accuracy improves but test accuracy drops. What happened?",
      "options": [
        "Overfitting",
        "Underfitting",
        "Learning rate too small",
        "Too many missing values"
      ],
      "correctAnswerIndex": 0,
      "explanation": "Higher training accuracy with lower test performance indicates overfitting."
    }
  ]
}