Close Menu
    Facebook X (Twitter) Instagram
    Articles Stock
    • Home
    • Technology
    • AI
    • Pages
      • About ArticlesStock — AI & Technology Journalist
      • Contact us
      • Disclaimer For Articles Stock
      • Privacy Policy
      • Terms and Conditions
    Facebook X (Twitter) Instagram
    Articles Stock
    AI

    A Coding Implementation to Explore and Analyze the TaskTrove Dataset with Streaming, Parsing, Visualization, and Verifier Detection

    Naveed AhmadBy Naveed Ahmad04/05/2026No Comments3 Mins Read
    blog 8


    filename_counter: Counter = Counter()
    all_json_keys:    Counter = Counter()
    samples_for_show: Record = []
    
    
    for i, row in enumerate(tqdm(ds_test, desc="inspecting construction", whole=200)):
       if i >= 200:
           break
       p = parse_task(row["task_binary"])
       if p["format"] in ("tar", "zip"):
           for identify, physique in p["files"].gadgets():
               filename_counter[name] += 1
               if identify.endswith(".json") and isinstance(physique, str):
                   strive:
                       obj = json.hundreds(physique)
                       if isinstance(obj, dict):
                           for okay in obj.keys():
                               all_json_keys[k] += 1
                   besides Exception:
                       cross
           if len(samples_for_show) < 2:
               samples_for_show.append((row["path"], p))
    
    
    print("nMost frequent filenames inside activity archives:")
    for identify, n in filename_counter.most_common(15):
       print(f"  {n:>4}  {identify}")
    
    
    print("nMost frequent top-level JSON keys (throughout any *.json):")
    for okay, n in all_json_keys.most_common(20):
       print(f"  {n:>4}  {okay}")
    
    
    if samples_for_show:
       print(f"nFull file itemizing for one pattern activity ({samples_for_show[0][0]}):")
       for identify, physique in samples_for_show[0][1]["files"].gadgets():
           sz = len(physique) if isinstance(physique, (str, bytes)) else 0
           print(f"  {identify}  ({sz:,} B)")
    
    
    
    
    VERIFIER_FILE_PATTERNS = ("verifier", "confirm", "grader", "choose", "rating", "eval")
    VERIFIER_JSON_KEYS     = ("verifier", "verifier_config", "choose", "grader",
                             "rubric", "test_patch", "FAIL_TO_PASS", "checks")
    
    
    
    
    def has_verifier(parsed: Dict[str, Any]) -> bool:
       """Detect verifiers through filename, JSON content material, or each."""
       if parsed["format"] not in ("tar", "zip"):
           c = parsed.get("content material")
           if isinstance(c, dict):
               return any(okay in c for okay in VERIFIER_JSON_KEYS)
           return False
    
    
       recordsdata = parsed["files"]
    
    
       for identify in recordsdata:
           low = identify.decrease()
           if any(pat in low for pat in VERIFIER_FILE_PATTERNS):
               return True
    
    
       for identify, physique in recordsdata.gadgets():
           if identify.endswith((".json", ".yaml", ".yml")) and isinstance(physique, str):
               strive:
                   obj = json.hundreds(physique)
                   if isinstance(obj, dict) and any(okay in obj for okay in VERIFIER_JSON_KEYS):
                       return True
               besides Exception:
                   cross
               low = physique.decrease()
               if "verifier" in low or "test_patch" in low:
                   return True
    
    
       return False
    
    
    
    
    class TaskTroveExplorer:
       """Excessive-level interface to the open-thoughts/TaskTrove dataset."""
    
    
       def __init__(self, break up: str = "take a look at", dataset_id: str = DATASET_ID):
           self.dataset_id = dataset_id
           self.break up = break up
           self._ds = load_dataset(dataset_id, break up=break up, streaming=True)
    
    
       def iter(self, restrict: Non-compulsory[int] = None,
                source_filter: Non-compulsory[str] = None) -> Iterator[Dict[str, Any]]:
           rx = re.compile(source_filter) if source_filter else None
           n = 0
           for row in self._ds:
               if rx and never rx.search(source_of(row["path"])):
                   proceed
               yield row
               n += 1
               if restrict is just not None and n >= restrict:
                   return
    
    
       def pattern(self, n: int = 5,
                  source_filter: Non-compulsory[str] = None) -> Record[Dict[str, Any]]:
           out = []
           for row in self.iter(restrict=n, source_filter=source_filter):
               parsed = parse_task(row["task_binary"])
               parsed["path"] = row["path"]
               parsed["source"] = source_of(row["path"])
               out.append(parsed)
           return out
    
    
       def abstract(self, restrict: int = 1000,
                   source_filter: Non-compulsory[str] = None) -> pd.DataFrame:
           rows = []
           for row in self.iter(restrict=restrict, source_filter=source_filter):
               parsed = parse_task(row["task_binary"])
               rows.append({
                   "supply": source_of(row["path"]),
                   "compressed": parsed["compressed_size"],
                   "uncooked": parsed["raw_size"],
                   "format": parsed["format"],
                   "n_files": len(parsed.get("recordsdata", {})),
                   "has_verifier": has_verifier(parsed),
               })
           df = pd.DataFrame(rows)
           if df.empty:
               return df
           return (df.groupby("supply")
                     .agg(n=("compressed", "depend"),
                          mean_compressed_kb=("compressed", lambda s: s.imply()/1024),
                          mean_raw_kb=("uncooked",                lambda s: s.imply()/1024),
                          mean_n_files=("n_files", "imply"),
                          verifier_rate=("has_verifier", "imply"))
                     .spherical(2)
                     .sort_values("n", ascending=False))
    
    
       @staticmethod
       def has_verifier(parsed: Dict[str, Any]) -> bool:
           return has_verifier(parsed)
    
    
       def export(self, output_dir: Union[str, Path], n: int = 10,
                  source_filter: Non-compulsory[str] = None) -> Path:
           output_dir = Path(output_dir)
           output_dir.mkdir(dad and mom=True, exist_ok=True)
           for parsed in self.pattern(n=n, source_filter=source_filter):
               slug = parsed["path"].change("/", "_")
               tdir = output_dir / slug
               tdir.mkdir(exist_ok=True)
               if parsed["format"] in ("tar", "zip"):
                   for identify, physique in parsed["files"].gadgets():
                       out = tdir / identify
                       out.guardian.mkdir(dad and mom=True, exist_ok=True)
                       if isinstance(physique, str):
                           out.write_text(physique, encoding="utf-8")
                       else:
                           out.write_bytes(physique)
               else:
                   content material = parsed.get("content material", b"")
                   if isinstance(content material, (dict, checklist)):
                       (tdir / "activity.json").write_text(json.dumps(content material, indent=2))
                   elif isinstance(content material, str):
                       (tdir / "activity.txt").write_text(content material)
                   else:
                       (tdir / "activity.bin").write_bytes(content material)
           print(f"✓ exported duties to {output_dir.resolve()}")
           return output_dir
    
    
    
    
    explorer = TaskTroveExplorer(break up="take a look at")
    
    
    print("nSample of three parsed duties:")
    for s in explorer.pattern(n=3):
       print(f"path: {s['path']} | supply: {s['source']} | format: {s['format']} | "
             f"recordsdata: {len(s.get('recordsdata', {}))} | verifier: {has_verifier(s)}")



    Source link

    Naveed Ahmad

    Naveed Ahmad is a technology journalist and AI writer at ArticlesStock, covering artificial intelligence, machine learning, and emerging tech policy. Read his latest articles.

    Related Posts

    A Developer’s Guide to Systematic Prompting: Mastering Negative Constraints, Structured JSON Outputs, and Multi-Hypothesis Verbalized Sampling

    04/05/2026

    ‘This is amazing’ artist says AI startup stole his art

    04/05/2026

    In Harvard study, AI offered more accurate diagnoses than emergency room doctors

    03/05/2026
    Leave A Reply Cancel Reply

    Categories
    • AI
    Recent Comments
      Facebook X (Twitter) Instagram Pinterest
      © 2026 ThemeSphere. Designed by ThemeSphere.

      Type above and press Enter to search. Press Esc to cancel.