"""
Example script to query webhook messages from the dataset.

This demonstrates how to load and analyze the batched parquet files.
"""

import json

import pandas as pd

# Hugging Face Hub dataset and split that the example reads.
DATASET_ID = "assafvayner/webhook-messages"
DATASET_SPLIT = "train"

# Width of the "-" / "=" separator rules printed between sections.
RULE_WIDTH = 50

# Maximum number of characters of a sample payload to print.
PREVIEW_CHARS = 500


def load_messages(dataset_id: str = DATASET_ID, split: str = DATASET_SPLIT):
    """Load one split of the webhook messages dataset.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to load.

    Returns:
        Whatever ``datasets.load_dataset`` returns for that split.
    """
    # Imported here rather than at module level so the reporting helpers
    # below stay importable without the ``datasets`` package installed.
    from datasets import load_dataset

    return load_dataset(dataset_id, split=split)


def preview(text: str, limit: int = PREVIEW_CHARS) -> str:
    """Return *text* cut to *limit* characters.

    The ``"..."`` marker is appended only when something was actually cut
    off, so a short text is never shown as if it had been truncated.
    """
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def print_section(title: str) -> None:
    """Print a section title framed by two ``=`` rules."""
    print("\n" + "=" * RULE_WIDTH)
    print(title)
    print("=" * RULE_WIDTH)


def print_first_message(df: pd.DataFrame) -> None:
    """Print the fields and pretty-printed payload of the first row.

    Reads the ``timestamp``, ``event_type``, ``scope`` and ``payload``
    columns; ``payload`` is decoded with ``json.loads``. An empty frame
    prints a short notice instead of raising ``IndexError``.
    """
    print("\nFirst message:")
    print("-" * RULE_WIDTH)

    if df.empty:
        print("(no messages in dataset)")
        return

    first_msg = df.iloc[0]
    print(f"Timestamp: {first_msg['timestamp']}")
    print(f"Event Type: {first_msg['event_type']}")
    print(f"Scope: {first_msg['scope']}")
    print("\nPayload:")
    payload = json.loads(first_msg["payload"])
    print(json.dumps(payload, indent=2))


def print_summary(df: pd.DataFrame) -> None:
    """Print event-type counts, scope counts and the timestamp range."""
    print_section("Summary Statistics:")

    # Event type distribution
    print("\nEvent Types:")
    print(df["event_type"].value_counts())

    print("\nScope Distribution:")
    print(df["scope"].value_counts())

    # Time range
    print("\nTime Range:")
    print(f" First message: {df['timestamp'].min()}")
    print(f" Last message: {df['timestamp'].max()}")


def print_scope_events(df: pd.DataFrame, scope: str = "repo") -> None:
    """Print how many rows have the given scope, plus one sample payload.

    Args:
        df: Messages frame with ``scope`` and ``payload`` columns.
        scope: Value of the ``scope`` column to filter on.
    """
    print_section(f"Example Query: Find all '{scope}' scope events")

    matching = df[df["scope"] == scope]
    print(f"Found {len(matching)} events")

    # Show a sample payload, shortened if it is long.
    if not matching.empty:
        print("\nSample payload:")
        sample_payload = json.loads(matching.iloc[0]["payload"])
        print(preview(json.dumps(sample_payload, indent=2)))


def main() -> None:
    """Load the dataset and print the example report."""
    print("Loading webhook messages dataset...")
    dataset = load_messages()
    print(f"Total messages: {len(dataset)}")

    # Convert to pandas for easier querying
    df = dataset.to_pandas()

    print_first_message(df)
    print_summary(df)
    print_scope_events(df)


if __name__ == "__main__":
    main()