Spaces:

deedrop1140
/

MachineLearningAlgorithms

Running

File size: 49,472 Bytes

0d00d62

  {
    "title": "K-Means Clustering Mastery: 100 MCQs",
    "description": "A comprehensive set of 100 multiple-choice questions designed to test and deepen your understanding of K-Means Clustering, covering basic concepts, algorithm steps, practical scenarios, and challenges in high-dimensional or real-world datasets.",
    "questions": [
      {
        "id": 1,
        "questionText": "What is the primary goal of K-Means Clustering?",
        "options": [
          "Partition data into K clusters minimizing within-cluster variance",
          "Reduce the dimensionality of the dataset",
          "Detect outliers in the dataset",
          "Classify data into predefined categories"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means aims to divide data into K clusters such that the sum of squared distances between points and their cluster centroid is minimized."
      },
      {
        "id": 2,
        "questionText": "In K-Means, what does a 'centroid' represent?",
        "options": [
          "A random point from the dataset",
          "The farthest point from the cluster",
          "The maximum value in the cluster",
          "The mean position of all points in the cluster"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Centroid is the mean of all points in a cluster and represents the cluster's center."
      },
      {
        "id": 3,
        "questionText": "Which step is repeated in K-Means until convergence?",
        "options": [
          "Compute correlation matrix",
          "Assign points to nearest centroid and update centroids",
          "Remove outliers",
          "Randomly shuffle data points"
        ],
        "correctAnswerIndex": 1,
        "explanation": "K-Means iteratively assigns points to the nearest centroid and recalculates centroids until assignments stabilize."
      },
      {
        "id": 4,
        "questionText": "Scenario: K-Means converges but clusters are uneven in size. Likely reason?",
        "options": [
          "Centroids are incorrect",
          "Distance metric used is Euclidean",
          "Data distribution is skewed",
          "Algorithm failed"
        ],
        "correctAnswerIndex": 2,
        "explanation": "K-Means partitions based on distances; skewed or non-spherical distributions can lead to uneven cluster sizes."
      },
      {
        "id": 5,
        "questionText": "What is the main limitation of K-Means clustering?",
        "options": [
          "Sensitive to outliers",
          "Requires predefined number of clusters (K)",
          "Only works for numerical data",
          "All of the above"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means works only with numerical data, needs K as input, and is sensitive to outliers."
      },
      {
        "id": 6,
        "questionText": "Scenario: K-Means applied to customer locations. Distance metric to use?",
        "options": [
          "Euclidean distance",
          "Hamming distance",
          "Cosine similarity",
          "Jaccard index"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Euclidean distance is standard for K-Means and spatial numerical data."
      },
      {
        "id": 7,
        "questionText": "Scenario: K-Means on 2D points results vary with different initial centroids. Solution?",
        "options": [
          "Use hierarchical clustering instead",
          "Ignore initial centroids",
          "Reduce K",
          "Use K-Means++ initialization"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means++ selects better initial centroids to improve convergence and consistency."
      },
      {
        "id": 8,
        "questionText": "Scenario: K-Means on concentric circles fails. Reason?",
        "options": [
          "K-Means assumes spherical clusters",
          "Data contains outliers",
          "Distance metric wrong",
          "Algorithm converged too quickly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means works best for convex, spherical clusters; it cannot separate concentric circular clusters."
      },
      {
        "id": 9,
        "questionText": "Scenario: After K-Means clustering, silhouette score is low. Interpretation?",
        "options": [
          "Noise ignored automatically",
          "Clusters overlap or poorly defined",
          "Clusters are perfect",
          "Algorithm converged correctly"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Low silhouette score indicates points are close to neighboring cluster centroids; clusters are not well separated."
      },
      {
        "id": 10,
        "questionText": "Scenario: Large dataset with millions of points. K-Means limitation?",
        "options": [
          "Algorithm fails completely",
          "Distance metric is irrelevant",
          "Cannot calculate centroids",
          "Convergence can be slow; consider Mini-Batch K-Means"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Mini-Batch K-Means is a faster variant suitable for large datasets."
      },
      {
        "id": 11,
        "questionText": "Scenario: K-Means on a dataset with outliers. Effect?",
        "options": [
          "Algorithm removes outliers",
          "Centroids can shift towards outliers, distorting clusters",
          "Clusters become more compact",
          "Clusters ignore outliers automatically"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Outliers can disproportionately affect centroids, leading to poorly defined clusters."
      },
      {
        "id": 12,
        "questionText": "Scenario: K-Means on categorical data. Limitation?",
        "options": [
          "K-Means requires numerical data; cannot handle categorical directly",
          "Clusters merge randomly",
          "Categorical data improves clustering",
          "Algorithm automatically encodes categories"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means relies on distance metrics, which are not directly defined for categorical data."
      },
      {
        "id": 13,
        "questionText": "Scenario: Selecting K for K-Means. Which method helps?",
        "options": [
          "Merge dendrograms",
          "Random selection",
          "Elbow method",
          "Silhouette ignored"
        ],
        "correctAnswerIndex": 2,
        "explanation": "The Elbow method plots sum of squared errors vs K and identifies an 'elbow' point as optimal K."
      },
      {
        "id": 14,
        "questionText": "Scenario: K-Means fails to separate overlapping clusters. Likely reason?",
        "options": [
          "K too small",
          "Centroids are optimal",
          "Algorithm converged correctly",
          "Clusters are not well-separated or non-convex"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means works best for well-separated convex clusters; overlapping clusters are challenging."
      },
      {
        "id": 15,
        "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
        "options": [
          "Algorithm fails automatically",
          "Noise ignored",
          "Distance metrics lose meaning; consider PCA or feature scaling",
          "Clusters are always compact"
        ],
        "correctAnswerIndex": 2,
        "explanation": "High-dimensional spaces dilute distances; dimensionality reduction improves clustering."
      },
      {
        "id": 16,
        "questionText": "Scenario: K-Means with K too large. Effect?",
        "options": [
          "Algorithm automatically reduces K",
          "Clusters always improve",
          "Clusters may become small and meaningless",
          "Noise ignored"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Choosing K too large can lead to many tiny clusters with no meaningful pattern."
      },
      {
        "id": 17,
        "questionText": "Scenario: Mini-Batch K-Means. Advantage?",
        "options": [
          "Faster for large datasets with approximate centroids",
          "Removes noise automatically",
          "More accurate than standard K-Means",
          "Works only on small datasets"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Mini-Batch K-Means updates centroids using small random batches for efficiency on large datasets."
      },
      {
        "id": 18,
        "questionText": "Scenario: K-Means initialization affects results. Solution?",
        "options": [
          "Use single random centroid only",
          "Run algorithm multiple times or use K-Means++",
          "Ignore initialization",
          "Reduce K randomly"
        ],
        "correctAnswerIndex": 1,
        "explanation": "K-Means++ and multiple runs improve stability and reduce sensitivity to initial centroids."
      },
      {
        "id": 19,
        "questionText": "Scenario: K-Means distance metric. Standard choice?",
        "options": [
          "Euclidean distance",
          "Jaccard index",
          "Hamming distance",
          "Cosine similarity"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means typically uses Euclidean distance to assign points to nearest centroids."
      },
      {
        "id": 20,
        "questionText": "Scenario: K-Means clustering produces empty clusters. Cause?",
        "options": [
          "Algorithm failed",
          "Clusters are compact",
          "No points assigned to some centroids",
          "Centroids are optimal"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Some centroids may not attract any points, leading to empty clusters."
      },
      {
        "id": 21,
        "questionText": "Scenario: K-Means on text embeddings. Preprocessing required?",
        "options": [
          "Use categorical K-Means directly",
          "No preprocessing needed",
          "Randomly assign clusters",
          "Normalize or scale vectors before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Text embeddings often need normalization to prevent certain dimensions from dominating distance computations."
      },
      {
        "id": 22,
        "questionText": "Scenario: K-Means with very high K. Effect on SSE (sum of squared errors)?",
        "options": [
          "SSE increases",
          "SSE is ignored",
          "SSE decreases as K increases",
          "SSE remains constant"
        ],
        "correctAnswerIndex": 2,
        "explanation": "As K increases, each cluster contains fewer points, reducing the sum of squared distances."
      },
      {
        "id": 23,
        "questionText": "Scenario: K-Means on scaled vs unscaled features. Effect?",
        "options": [
          "Clusters merge randomly",
          "Scaling is important; features with large range dominate clustering",
          "Algorithm fails if not scaled",
          "Scaling is unnecessary"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Features with larger numerical ranges can dominate Euclidean distance; scaling ensures fair contribution."
      },
      {
        "id": 24,
        "questionText": "Scenario: K-Means convergence criteria. Standard check?",
        "options": [
          "Centroid positions or cluster assignments stop changing",
          "Distance metric ignored",
          "Random stopping",
          "Maximum iterations only"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Algorithm stops when centroids or cluster assignments stabilize, or after a max number of iterations."
      },
      {
        "id": 25,
        "questionText": "Scenario: K-Means for image compression. How?",
        "options": [
          "Cluster pixel colors and replace each pixel by its centroid",
          "Use hierarchical clustering",
          "Remove noise automatically",
          "Reduce image resolution"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means clusters similar colors, allowing image compression by using cluster centroids as representative colors."
      },
      {
        "id": 26,
        "questionText": "Scenario: K-Means for customer segmentation. Benefit?",
        "options": [
          "Identify customer groups for targeted marketing",
          "Automatically predicts sales",
          "Detects trends over time",
          "Removes outliers"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means helps segment customers based on behavior, allowing targeted campaigns."
      },
      {
        "id": 27,
        "questionText": "Scenario: K-Means clustering results differ on repeated runs. Cause?",
        "options": [
          "Distance metric varies",
          "Random initialization of centroids",
          "Algorithm deterministic",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Random initial centroids can lead to different final clusters; K-Means++ mitigates this."
      },
      {
        "id": 28,
        "questionText": "Scenario: K-Means for anomaly detection. Approach?",
        "options": [
          "Clusters merge randomly",
          "Points far from nearest centroid may be anomalies",
          "All points treated equally",
          "Noise automatically ignored"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Outliers are detected as points distant from cluster centroids."
      },
      {
        "id": 29,
        "questionText": "Scenario: K-Means clustering on geospatial data. Best practice?",
        "options": [
          "Use distance metric appropriate for coordinates (e.g., haversine)",
          "Randomly assign clusters",
          "Use Euclidean blindly",
          "Clusters merge arbitrarily"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Euclidean distance may misrepresent geographic distances; use geodesic metrics like haversine."
      },
      {
        "id": 30,
        "questionText": "Scenario: K-Means with highly correlated features. Solution?",
        "options": [
          "Increase K randomly",
          "Apply PCA to reduce correlated dimensions",
          "Ignore correlations",
          "Clusters merge arbitrarily"
        ],
        "correctAnswerIndex": 1,
        "explanation": "PCA reduces correlated features and improves clustering performance."
      },
      {
        "id": 31,
        "questionText": "Scenario: K-Means on non-spherical clusters. Limitation?",
        "options": [
          "Noise ignored",
          "Clusters are always compact",
          "Algorithm automatically adapts",
          "K-Means assumes spherical clusters; non-spherical clusters may be poorly separated"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means relies on Euclidean distance and assumes roughly spherical clusters, so elongated or irregular clusters are not well captured."
      },
      {
        "id": 32,
        "questionText": "Scenario: K-Means clustering produces clusters with very different densities. Challenge?",
        "options": [
          "Distance metric ignored",
          "Clusters always equal",
          "Low-density clusters may be merged incorrectly",
          "Algorithm detects densities automatically"
        ],
        "correctAnswerIndex": 2,
        "explanation": "K-Means does not account for density; clusters with differing densities may not be separated properly."
      },
      {
        "id": 33,
        "questionText": "Scenario: K-Means applied to text embeddings. Best practice?",
        "options": [
          "Increase K arbitrarily",
          "Normalize embeddings to unit vectors before clustering",
          "Remove half the features randomly",
          "Use raw embeddings"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Normalization ensures that distance computation reflects angle similarity rather than magnitude differences."
      },
      {
        "id": 34,
        "questionText": "Scenario: K-Means clustering with missing values. Approach?",
        "options": [
          "Ignore missing values",
          "Randomly assign missing values",
          "Algorithm automatically handles them",
          "Impute missing values before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means requires complete numerical data; missing values should be imputed or removed."
      },
      {
        "id": 35,
        "questionText": "Scenario: K-Means clustering on multi-dimensional customer features. Preprocessing step?",
        "options": [
          "Randomly drop features",
          "Scale features so all dimensions contribute equally",
          "Increase K arbitrarily",
          "Leave features unscaled"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Feature scaling ensures that dimensions with larger ranges do not dominate Euclidean distance."
      },
      {
        "id": 36,
        "questionText": "Scenario: K-Means on a dataset with outliers. Solution?",
        "options": [
          "Use standard K-Means without changes",
          "Randomly assign clusters",
          "Increase K to compensate",
          "Remove or preprocess outliers before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Outliers can distort centroids; preprocessing improves clustering accuracy."
      },
      {
        "id": 37,
        "questionText": "Scenario: K-Means convergence too slow. Solution?",
        "options": [
          "Ignore convergence",
          "Change distance metric arbitrarily",
          "Use Mini-Batch K-Means or reduce dataset size",
          "Increase K randomly"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Mini-Batch K-Means or subsampling speeds up convergence for large datasets."
      },
      {
        "id": 38,
        "questionText": "Scenario: K-Means clustering with highly correlated features. Best approach?",
        "options": [
          "Increase K",
          "Ignore correlation",
          "Merge clusters arbitrarily",
          "Apply PCA or feature selection to reduce redundancy"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Reducing correlated dimensions prevents redundant information from biasing distance calculations."
      },
      {
        "id": 39,
        "questionText": "Scenario: K-Means clustering on skewed data. Issue?",
        "options": [
          "Noise ignored",
          "Algorithm corrects automatically",
          "Clusters may be biased towards dense regions",
          "Clusters always balanced"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Skewed distributions can lead to unequal cluster sizes or poorly defined boundaries."
      },
      {
        "id": 40,
        "questionText": "Scenario: K-Means with K unknown. Methods to select K?",
        "options": [
          "Random choice",
          "Algorithm decides automatically",
          "Use maximum data points",
          "Elbow method, silhouette score, gap statistic"
        ],
        "correctAnswerIndex": 3,
        "explanation": "These methods help determine optimal K by evaluating clustering performance."
      },
      {
        "id": 41,
        "questionText": "Scenario: K-Means produces very similar clusters on repeated runs. Possible reason?",
        "options": [
          "Algorithm converged incorrectly",
          "Data naturally forms stable clusters",
          "Distance metric is wrong",
          "Initialization randomization failed"
        ],
        "correctAnswerIndex": 1,
        "explanation": "If data has well-separated clusters, K-Means results are stable across runs."
      },
      {
        "id": 42,
        "questionText": "Scenario: K-Means on a small dataset with large K. Risk?",
        "options": [
          "Algorithm fails completely",
          "Centroids ignored",
          "Clusters may be too small or empty",
          "Clusters automatically merge"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Too many clusters for small datasets can produce meaningless or empty clusters."
      },
      {
        "id": 43,
        "questionText": "Scenario: K-Means++ initialization. Benefit?",
        "options": [
          "Improves cluster quality by selecting distant initial centroids",
          "Random initialization",
          "Always produces identical clusters",
          "Removes noise automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means++ reduces poor initialization by spreading centroids apart."
      },
      {
        "id": 44,
        "questionText": "Scenario: K-Means with categorical features. Solution?",
        "options": [
          "Use K-Prototypes or encode categories numerically",
          "Clusters merge randomly",
          "Ignore categorical data",
          "Use standard K-Means directly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Standard K-Means cannot handle categorical data; K-Prototypes or encoding is needed."
      },
      {
        "id": 45,
        "questionText": "Scenario: K-Means on noisy sensor data. Best practice?",
        "options": [
          "Use raw data",
          "Increase K arbitrarily",
          "Filter or preprocess noise before clustering",
          "Ignore convergence"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Noise affects centroids and cluster assignment; preprocessing improves results."
      },
      {
        "id": 46,
        "questionText": "Scenario: K-Means for image segmentation. Metric for colors?",
        "options": [
          "Cosine similarity",
          "Euclidean distance in RGB or LAB space",
          "Hamming distance",
          "Jaccard index"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Euclidean distance is standard for numerical pixel features in color space."
      },
      {
        "id": 47,
        "questionText": "Scenario: K-Means convergence to local minimum. Reason?",
        "options": [
          "Poor initialization of centroids",
          "Algorithm always finds global minimum",
          "Clusters are too compact",
          "Distance metric is incorrect"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Random initial centroids can lead K-Means to converge to suboptimal local minima."
      },
      {
        "id": 48,
        "questionText": "Scenario: K-Means clustering with overlapping clusters. Limitation?",
        "options": [
          "Clusters merge automatically",
          "Algorithm adapts perfectly",
          "Cannot clearly separate overlapping clusters",
          "Noise ignored"
        ],
        "correctAnswerIndex": 2,
        "explanation": "K-Means relies on distance; overlapping clusters may not be correctly assigned."
      },
      {
        "id": 49,
        "questionText": "Scenario: K-Means for market segmentation. Use case?",
        "options": [
          "Remove outliers automatically",
          "Identify customer groups for targeted campaigns",
          "Predict stock prices",
          "Visualize time series"
        ],
        "correctAnswerIndex": 1,
        "explanation": "K-Means clusters similar customers to enable targeted marketing strategies."
      },
      {
        "id": 50,
        "questionText": "Scenario: K-Means for anomaly detection in credit card transactions. Approach?",
        "options": [
          "Transactions far from cluster centroids may be fraudulent",
          "All transactions treated equally",
          "Clusters merge automatically",
          "Noise ignored"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Outliers distant from normal clusters can indicate anomalous or fraudulent activity."
      },
      {
        "id": 51,
        "questionText": "Scenario: K-Means on high-dimensional gene expression data. Best practice?",
        "options": [
          "Clusters merge randomly",
          "Use raw high-dimensional data directly",
          "Increase K arbitrarily",
          "Use PCA or dimensionality reduction before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Dimensionality reduction helps meaningful clustering and avoids distance dilution."
      },
      {
        "id": 52,
        "questionText": "Scenario: K-Means on very large dataset. Speed-up technique?",
        "options": [
          "Mini-Batch K-Means",
          "Increase K",
          "Ignore convergence",
          "Use raw data"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Mini-Batch K-Means updates centroids using batches, reducing computation time."
      },
      {
        "id": 53,
        "questionText": "Scenario: K-Means applied to IoT sensor data with missing values. Solution?",
        "options": [
          "Impute missing values before clustering",
          "Remove entire dataset",
          "Assign clusters randomly",
          "Ignore missing values"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means requires complete numerical data; missing values must be handled prior to clustering."
      },
      {
        "id": 54,
        "questionText": "Scenario: K-Means applied to customer purchase history. Challenge?",
        "options": [
          "Clusters automatically balanced",
          "Sparse purchase data may lead to poor cluster separation",
          "Algorithm converges perfectly",
          "Noise ignored"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Sparse or high-dimensional data can reduce clustering accuracy; preprocessing helps."
      },
      {
        "id": 55,
        "questionText": "Scenario: K-Means with categorical features encoded as numbers. Risk?",
        "options": [
          "Algorithm works perfectly",
          "Clusters merge automatically",
          "Noise ignored",
          "Numerical encoding may introduce artificial distance relationships"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Direct numeric encoding of categorical data can misrepresent similarity between categories."
      },
      {
        "id": 56,
        "questionText": "Scenario: K-Means for spatial clustering of stores. Best practice?",
        "options": [
          "Increase K arbitrarily",
          "Clusters merge randomly",
          "Use raw coordinates directly",
          "Normalize coordinates or use appropriate distance metric"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Scaling ensures coordinates are comparable and distance computations are accurate."
      },
      {
        "id": 57,
        "questionText": "Scenario: K-Means produces poor clustering. Possible reason?",
        "options": [
          "Data not suitable for K-Means (non-spherical or overlapping)",
          "Centroids incorrect",
          "Algorithm always finds perfect clusters",
          "Distance metric irrelevant"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means struggles with non-spherical or overlapping clusters."
      },
      {
        "id": 58,
        "questionText": "Scenario: K-Means clustering on scaled features. Advantage?",
        "options": [
          "Distance metric changes",
          "Clusters merge automatically",
          "Prevents dominance by features with large range",
          "Algorithm ignores scaling"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Scaling ensures each feature contributes equally to Euclidean distance calculations."
      },
      {
        "id": 59,
        "questionText": "Scenario: K-Means with clusters of unequal variance. Issue?",
        "options": [
          "Noise ignored",
          "Algorithm automatically adjusts",
          "Clusters always compact",
          "Clusters may not accurately represent data structure"
        ],
        "correctAnswerIndex": 3,
        "explanation": "K-Means assumes similar variance; large differences affect cluster quality."
      },
      {
        "id": 60,
        "questionText": "Scenario: K-Means applied to time-series data. Approach?",
        "options": [
          "Use raw sequences directly",
          "Increase K arbitrarily",
          "Clusters merge randomly",
          "Extract meaningful features before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Feature extraction ensures distance metrics are meaningful for time-series clustering."
      },
      {
        "id": 61,
        "questionText": "Scenario: K-Means clusters overlap. Evaluation metric?",
        "options": [
          "Use SSE only",
          "Clusters merge randomly",
          "Ignore overlap",
          "Silhouette score measures separation and cohesion"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Silhouette score evaluates how well points fit within their clusters vs others."
      },
      {
        "id": 62,
        "questionText": "Scenario: K-Means with too few clusters. Result?",
        "options": [
          "Clusters may merge dissimilar points, reducing interpretability",
          "Algorithm adapts automatically",
          "Clusters always compact",
          "Noise ignored"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Too small K forces dissimilar points into same cluster, reducing accuracy."
      },
      {
        "id": 63,
        "questionText": "Scenario: K-Means for market basket analysis. Limitation?",
        "options": [
          "Algorithm works perfectly",
          "Noise ignored",
          "Clusters merge automatically",
          "Sparse and categorical data requires encoding or alternate methods"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Sparse categorical data needs careful preprocessing or K-Prototypes instead of K-Means."
      },
      {
        "id": 64,
        "questionText": "Scenario: K-Means produces empty clusters. Solution?",
        "options": [
          "Ignore empty clusters",
          "Algorithm fails automatically",
          "Increase K randomly",
          "Reinitialize centroids or reduce K"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Reassigning centroids or reducing K resolves empty clusters."
      },
      {
        "id": 65,
        "questionText": "Scenario: K-Means with high-dimensional data. Challenge?",
        "options": [
          "Distance metrics lose meaning; reduce dimensions",
          "Algorithm adapts automatically",
          "Clusters always accurate",
          "Noise ignored"
        ],
        "correctAnswerIndex": 0,
        "explanation": "High dimensions dilute distances, making clustering unreliable without dimensionality reduction."
      },
      {
        "id": 66,
        "questionText": "Scenario: K-Means on normalized vs unnormalized features. Effect?",
        "options": [
          "Algorithm automatically scales",
          "Normalization ensures fair distance contribution across features",
          "Unnormalized always better",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Normalized features prevent features with large ranges from dominating clustering."
      },
      {
        "id": 67,
        "questionText": "Scenario: K-Means++ vs random initialization. Advantage?",
        "options": [
          "Improves clustering stability and convergence",
          "Random initialization always better",
          "No difference in results",
          "Removes noise automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means++ spreads the initial centroids apart, reducing the chance of converging to a poor local minimum."
      },
      {
        "id": 68,
        "questionText": "Scenario: K-Means on customer purchase amounts. Data skewed. Solution?",
        "options": [
          "Use raw data",
          "Log-transform or scale data before clustering",
          "Increase K",
          "Ignore skew"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Transforming skewed data prevents high-value points from dominating clustering."
      },
      {
        "id": 69,
        "questionText": "Scenario: K-Means on text data after TF-IDF. Challenge?",
        "options": [
          "Algorithm works perfectly",
          "High-dimensional sparse vectors; dimensionality reduction recommended",
          "Noise ignored",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Sparse high-dimensional TF-IDF vectors may reduce clustering effectiveness without reduction."
      },
      {
        "id": 70,
        "questionText": "Scenario: K-Means applied to IoT device readings. Best practice?",
        "options": [
          "Increase K randomly",
          "Use raw readings",
          "Ignore convergence",
          "Normalize or scale features to ensure meaningful clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Scaling ensures that features contribute equally to distance calculations for clustering."
      },
      {
        "id": 71,
        "questionText": "Scenario: K-Means clustering applied to gene expression data with thousands of features. Best approach?",
        "options": [
          "Apply PCA or feature selection to reduce dimensionality before clustering",
          "Randomly remove features",
          "Increase K arbitrarily",
          "Use all features directly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "With thousands of gene features, pairwise distances become nearly equal; dimensionality reduction helps produce meaningful clusters."
      },
      {
        "id": 72,
        "questionText": "Scenario: K-Means with very large K relative to dataset size. Risk?",
        "options": [
          "Clusters may be meaningless or empty",
          "Algorithm automatically adjusts",
          "Distance metric ignored",
          "Clusters merge automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Too many clusters can lead to tiny or empty clusters with no interpretability."
      },
      {
        "id": 73,
        "questionText": "Scenario: K-Means on data with non-uniform density clusters. Limitation?",
        "options": [
          "Noise ignored",
          "Low-density clusters may merge with high-density ones",
          "Algorithm adjusts automatically",
          "Clusters always compact"
        ],
        "correctAnswerIndex": 1,
        "explanation": "K-Means does not handle varying densities well; centroids are pulled toward denser regions, so low-density clusters may be absorbed."
      },
      {
        "id": 74,
        "questionText": "Scenario: K-Means on highly skewed financial transaction data. Best preprocessing?",
        "options": [
          "Apply log transformation to reduce skew before clustering",
          "Use raw data",
          "Clusters merge randomly",
          "Increase K arbitrarily"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Log or other transformations reduce the effect of extreme values, improving clustering quality."
      },
      {
        "id": 75,
        "questionText": "Scenario: K-Means on time-series data. Effective method?",
        "options": [
          "Extract meaningful features such as trends or seasonal components before clustering",
          "Use raw sequences directly",
          "Clusters merge automatically",
          "Increase K randomly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Feature extraction ensures distances reflect meaningful similarities in time series data."
      },
      {
        "id": 76,
        "questionText": "Scenario: K-Means clustering for anomaly detection in network traffic. Strategy?",
        "options": [
          "All points treated equally",
          "Points far from cluster centroids are likely anomalies",
          "Noise ignored",
          "Clusters merge automatically"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Outliers distant from normal traffic clusters are potential anomalies."
      },
      {
        "id": 77,
        "questionText": "Scenario: K-Means applied to image color compression. Challenge?",
        "options": [
          "Algorithm automatically selects K",
          "All clusters identical",
          "Noise ignored",
          "Choosing optimal K to balance compression and image quality"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Selecting K is critical; too few clusters lose color details, too many reduce compression."
      },
      {
        "id": 78,
        "questionText": "Scenario: K-Means++ vs multiple random initializations. Advantage of K-Means++?",
        "options": [
          "Reduces likelihood of poor local minima and improves convergence",
          "Removes noise automatically",
          "Random initializations are better",
          "No difference in results"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means++ selects initial centroids that are far apart from each other, improving stability and cluster quality."
      },
      {
        "id": 79,
        "questionText": "Scenario: K-Means applied to sparse TF-IDF text vectors. Best approach?",
        "options": [
          "Use raw sparse vectors directly",
          "Increase K arbitrarily",
          "Reduce dimensionality using techniques like Truncated SVD before clustering",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 2,
        "explanation": "High-dimensional sparse data may produce poor clusters; dimensionality reduction improves performance."
      },
      {
        "id": 80,
        "questionText": "Scenario: K-Means clustering with overlapping spherical clusters. How to improve?",
        "options": [
          "K-Means always works",
          "Clusters merge automatically",
          "Reduce K randomly",
          "Use Gaussian Mixture Models (GMM) for soft clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "GMM can model cluster overlap using probability distributions, unlike hard K-Means assignments."
      },
      {
        "id": 81,
        "questionText": "Scenario: K-Means for customer segmentation with categorical attributes. Best practice?",
        "options": [
          "Use K-Prototypes or encode categories numerically",
          "Ignore categorical data",
          "Use standard K-Means directly",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Prototypes handles mixed numerical and categorical data effectively."
      },
      {
        "id": 82,
        "questionText": "Scenario: K-Means convergence to local minimum. Cause?",
        "options": [
          "Distance metric incorrect",
          "Clusters too compact",
          "Poor or random initialization of centroids",
          "Algorithm always finds global minimum"
        ],
        "correctAnswerIndex": 2,
        "explanation": "K-Means may converge to suboptimal solutions depending on initial centroids."
      },
      {
        "id": 83,
        "questionText": "Scenario: K-Means applied to geospatial clustering. Recommendation?",
        "options": [
          "Increase K randomly",
          "Use appropriate distance metrics like haversine for coordinates",
          "Clusters merge arbitrarily",
          "Use Euclidean distance blindly"
        ],
        "correctAnswerIndex": 1,
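        "explanation": "Latitude and longitude are angular coordinates, so plain Euclidean distance misrepresents geographic distance; an appropriate metric such as haversine is required for accurate clustering."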
      },
      {
        "id": 84,
        "questionText": "Scenario: K-Means with very large datasets. Efficient solution?",
        "options": [
          "Use Mini-Batch K-Means",
          "Increase K arbitrarily",
          "Ignore convergence",
          "Use full dataset only"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Mini-Batch K-Means speeds up computation by using small random batches for centroid updates."
      },
      {
        "id": 85,
        "questionText": "Scenario: K-Means on noisy IoT sensor data. Best preprocessing?",
        "options": [
          "Filter or smooth noise before clustering",
          "Clusters merge automatically",
          "Increase K arbitrarily",
          "Use raw data"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Noise can distort centroids; preprocessing improves clustering reliability."
      },
      {
        "id": 86,
        "questionText": "Scenario: K-Means on very high-dimensional data. Limitation?",
        "options": [
          "Distance metrics lose meaning; dimensionality reduction recommended",
          "Noise ignored",
          "Clusters always accurate",
          "Algorithm adapts automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "In high-dimensional spaces, pairwise distances become nearly equal, leading to poor cluster assignments."
      },
      {
        "id": 87,
        "questionText": "Scenario: K-Means for anomaly detection in healthcare data. Approach?",
        "options": [
          "Noise ignored",
          "Points far from cluster centroids may indicate anomalies",
          "Clusters merge automatically",
          "All points treated equally"
        ],
        "correctAnswerIndex": 1,
        "explanation": "Outliers distant from normal clusters can indicate anomalies or rare events."
      },
      {
        "id": 88,
        "questionText": "Scenario: K-Means on image segmentation with varying illumination. Challenge?",
        "options": [
          "Preprocessing like normalization is needed to reduce lighting effect",
          "Algorithm works perfectly",
          "Increase K randomly",
          "Clusters merge automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Differences in lighting affect pixel values; normalization improves clustering consistency."
      },
      {
        "id": 89,
        "questionText": "Scenario: K-Means for market segmentation with mixed purchase behavior. Solution?",
        "options": [
          "Ignore categorical data",
          "Clusters merge randomly",
          "Use numerical encoding or K-Prototypes for categorical and numerical features",
          "Use standard K-Means directly"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Mixed data requires specialized clustering methods for meaningful segmentation."
      },
      {
        "id": 90,
        "questionText": "Scenario: K-Means clustering produces empty clusters repeatedly. Best solution?",
        "options": [
          "Algorithm fails automatically",
          "Increase K arbitrarily",
          "Ignore empty clusters",
          "Reinitialize centroids or reduce K"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Empty clusters occur when centroids have no assigned points; reinitialization or lowering K resolves this."
      },
      {
        "id": 91,
        "questionText": "Scenario: K-Means applied to highly imbalanced datasets. Issue?",
        "options": [
          "Large clusters may dominate, small clusters underrepresented",
          "Clusters always balanced",
          "Noise ignored",
          "Algorithm adapts automatically"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means does not account for cluster size; imbalance may distort results."
      },
      {
        "id": 92,
        "questionText": "Scenario: K-Means applied to network traffic logs for intrusion detection. Best approach?",
        "options": [
          "Use raw logs directly",
          "Increase K arbitrarily",
          "Preprocess logs into numerical features and detect points far from centroids",
          "Clusters merge automatically"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Transforming logs to numerical vectors enables clustering and anomaly detection."
      },
      {
        "id": 93,
        "questionText": "Scenario: K-Means clustering with multiple valid K values. Evaluation metric?",
        "options": [
          "Silhouette score to evaluate cluster quality",
          "Ignore K selection",
          "Use SSE only",
          "Clusters merge randomly"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Silhouette score measures cohesion and separation, helping choose optimal K."
      },
      {
        "id": 94,
        "questionText": "Scenario: K-Means applied to text clustering using word embeddings. Limitation?",
        "options": [
          "High-dimensional vectors may require dimensionality reduction or normalization",
          "Clusters merge randomly",
          "Algorithm works perfectly",
          "Noise ignored"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Dimensionality reduction and normalization improve clustering accuracy for embeddings."
      },
      {
        "id": 95,
        "questionText": "Scenario: K-Means clustering results vary on repeated runs. Best solution?",
        "options": [
          "Use K-Means++ initialization or multiple runs",
          "Clusters merge randomly",
          "Ignore variations",
          "Increase K arbitrarily"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Better initialization reduces sensitivity to random centroid placement."
      },
      {
        "id": 96,
        "questionText": "Scenario: K-Means on scaled features vs unscaled features. Observation?",
        "options": [
          "Scaling ensures fair contribution of all features to distance calculation",
          "Clusters merge randomly",
          "Algorithm adapts automatically",
          "Scaling is unnecessary"
        ],
        "correctAnswerIndex": 0,
        "explanation": "Without scaling, features with larger ranges dominate cluster assignments."
      },
      {
        "id": 97,
        "questionText": "Scenario: K-Means clustering on overlapping clusters. Alternative?",
        "options": [
          "Reduce K randomly",
          "Clusters merge automatically",
          "Use soft clustering like Gaussian Mixture Models",
          "K-Means handles overlap perfectly"
        ],
        "correctAnswerIndex": 2,
        "explanation": "Soft clustering models allow points to belong probabilistically to multiple clusters."
      },
      {
        "id": 98,
        "questionText": "Scenario: K-Means applied to sensor network data with missing values. Solution?",
        "options": [
          "Impute missing values before clustering",
          "Assign clusters randomly",
          "Remove entire dataset",
          "Ignore missing values"
        ],
        "correctAnswerIndex": 0,
        "explanation": "K-Means requires complete data; missing values must be handled prior to clustering."
      },
      {
        "id": 99,
        "questionText": "Scenario: K-Means on customer behavior data with high-variance features. Best approach?",
        "options": [
          "Increase K arbitrarily",
          "Clusters merge randomly",
          "Use raw data",
          "Scale or normalize features to prevent dominance by high-variance features"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Scaling ensures fair contribution of each feature to distance computation."
      },
      {
        "id": 100,
        "questionText": "Scenario: K-Means applied to a large dataset with many outliers. Recommendation?",
        "options": [
          "Increase K arbitrarily",
          "Clusters merge automatically",
          "Use raw data directly",
          "Preprocess to remove or handle outliers before clustering"
        ],
        "correctAnswerIndex": 3,
        "explanation": "Outliers distort centroids; preprocessing ensures meaningful cluster assignments."
      }
    ]
  }