{
  "updatedAt": "2026-05-20T06:05:00.918Z",
  "date": "2026-05-20",
  "count": 60,
  "items": [
    {
      "arxivId": "2605.18739",
      "title": "LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation",
      "summary": "LongLive-2.0 presents an NVFP4-based parallel infrastructure for long video generation that addresses training and inference bottlenecks through sequence-parallel autoregressive training and diffusion model tuning.",
      "authors": [
        "Yukang Chen",
        "Luozhou Wang",
        "Wei Huang",
        "Shuai Yang",
        "Bohan Zhang",
        "Yicheng Xiao"
      ],
      "organization": {
        "_id": "60262b67268c201cdc8b7d43",
        "name": "nvidia",
        "fullname": "NVIDIA",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65df9200dc3292a8983e5017/Vs5FPVCH-VZBipV3qKTuy.png"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 97,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18739.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18739",
      "pdfUrl": "https://arxiv.org/pdf/2605.18739.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18739",
      "githubRepo": "https://github.com/NVlabs/LongLive",
      "githubStars": 1294,
      "keywords": [
        "NVFP4",
        "sequence-parallel autoregressive training",
        "Balanced SP",
        "teacher-forcing layout",
        "VAE encoding",
        "diffusion model"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "LongLive-2.0：基于NVFP4的长视频生成并行架构",
      "summary_zh": "提出基于NVFP4的并行架构，解决长视频生成训练与推理瓶颈，采用序列并行自回归训练和扩散模型调优。",
      "title_i18n": {
        "en": "LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation",
        "zh-CN": "LongLive-2.0：基于NVFP4的长视频生成并行架构",
        "ja": "LongLive-2.0: 長時間動画生成のためのNVFP4並列インフラストラクチャ",
        "ko": "LongLive-2.0: An NVFP4 Parallel Infrastructure for Long Video Generation",
        "es": "LongLive-2.0: Un infraestructura paralela NVFP4 para la generación de videos largos",
        "de": "LongLive-2.0: Ein NVFP4-basiertes paralleles Infrastrukturmodell für die Erstellung langer Videos"
      },
      "summary_i18n": {
        "en": "LongLive-2.0 presents an NVFP4-based parallel infrastructure for long video generation that addresses training and inference bottlenecks through sequence-parallel autoregressive training and diffusion model tuning.",
        "zh-CN": "提出基于NVFP4的并行架构，解决长视频生成训练与推理瓶颈，采用序列并行自回归训练和扩散模型调优。",
        "ja": "LongLive-2.0は、シーケンス並列自己回帰トレーニングと拡散モデルチューニングを通じて、長時間動画生成のためのNVFP4ベースの並列インフラストラクチャを提示しています。",
        "ko": "LongLive-2.0은 시퀀스 병렬 자가 회귀 학습과 확산 모델 튜닝을 통해 장비 영상 생성을 위한 NVFP4 기반 병렬 인프라를 제시합니다.",
        "es": "LongLive-2.0 presenta una infraestructura paralela basada en NVFP4 para la generación de videos largos, abordando cuellos de botella en entrenamiento e inferencia mediante entrenamiento autoregresivo secuencial y ajuste de modelos de difusión.",
        "de": "LongLive-2.0 präsentiert eine NVFP4-basierte parallele Infrastruktur für die Erstellung langer Videos, die Trainings- und Inferenzengpässe durch sequenzparallele autoregressive Trainingsmethoden und Diffusionsmodell-Anpassungen löst."
      }
    },
    {
      "arxivId": "2605.18747",
      "title": "Code as Agent Harness",
      "summary": "Large language models are increasingly used as operational substrates for agent reasoning and execution in agentic systems, with code serving as a unified infrastructure layer across multiple domains and applications.",
      "authors": [
        "Xuying Ning",
        "Katherine Tieu",
        "Dongqi Fu",
        "Tianxin Wei",
        "Zihao Li",
        "Yuanchen Bei"
      ],
      "organization": null,
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 159,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18747.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18747",
      "pdfUrl": "https://arxiv.org/pdf/2605.18747.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18747",
      "githubRepo": "https://github.com/YennNing/Awesome-Code-as-Agent-Harness-Papers",
      "githubStars": 58,
      "keywords": [
        "large language models",
        "agentic systems",
        "agent harness",
        "code as agent harness",
        "agent infrastructure",
        "reasoning"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "代码作为代理的架构",
      "summary_zh": "将代码作为统一基础设施层，用于代理系统中的推理与执行",
      "title_i18n": {
        "en": "Code as Agent Harness",
        "zh-CN": "代码作为代理的架构",
        "ja": "Code as Agent Harness",
        "ko": "Code as Agent Harness",
        "es": "Código como agente",
        "de": "Code as Agent Harness"
      },
      "summary_i18n": {
        "en": "Large language models are increasingly used as operational substrates for agent reasoning and execution in agentic systems, with code serving as a unified infrastructure layer across multiple domains and applications.",
        "zh-CN": "将代码作为统一基础设施层，用于代理系统中的推理与执行",
        "ja": "大規模言語モデルは、エージェントシステムにおける操作的基盤として使用され、コードが複数のドメインとアプリケーションで統一されたインフラストラクチャとして機能しています。",
        "ko": "대규모 언어 모델은 에이전트 시스템에서 운영 기반으로 사용되며, 코드는 다양한 도메인과 애플리케이션의 통합 인프라 계층 역할을 합니다.",
        "es": "Los grandes modelos de lenguaje se utilizan cada vez más como sustrato operativo para razonamiento y ejecución de agentes en sistemas agenciables, con código como capa de infraestructura unificada en múltiples dominios y aplicaciones.",
        "de": "Große Sprachmodelle werden zunehmend als Betriebssubstrat für Agentenreasoning und -ausführung in agentenbasierten Systemen verwendet, wobei Code als einheitliche Infrastruktur über mehrere Domänen hinweg dient."
      }
    },
    {
      "arxivId": "2605.18401",
      "title": "SkillsVote: Lifecycle Governance of Agent Skills from Collection, Recommendation to Evolution",
      "summary": "SkillsVote is a governance framework for long-horizon LLM agents that manages reusable skills through structured collection, recommendation, and evolution processes.",
      "authors": [
        "Hongyi Liu",
        "Haoyan Yang",
        "Tao Jiang",
        "Bo Tang",
        "Feiyu Xiong",
        "Zhiyu Li"
      ],
      "organization": {
        "_id": "658bf18c135580745c529d53",
        "name": "IAAR-Shanghai",
        "fullname": "Memtensor Research Group",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/62a155e615eeab266b2f2243/ONfcDR9Ox8AVtXtrAR4FG.png"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 114,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18401.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18401",
      "pdfUrl": "https://arxiv.org/pdf/2605.18401.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18401",
      "githubRepo": "https://github.com/MemTensor/skills-vote",
      "githubStars": 219,
      "keywords": [
        "Agent Skills",
        "experience schema",
        "executable scripts",
        "skill ecosystems",
        "lifecycle-governance framework",
        "environment requirements"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "SkillsVote：从收集到演化的代理技能生命周期治理",
      "summary_zh": "提出SkillsVote框架，通过结构化收集、推荐和演化过程管理长周期LLM代理的可复用技能",
      "title_i18n": {
        "en": "SkillsVote: Lifecycle Governance of Agent Skills from Collection, Recommendation to Evolution",
        "zh-CN": "SkillsVote：从收集到演化的代理技能生命周期治理",
        "ja": "SkillsVote: エージェントスキルのライフサイクルガバナンス（収集、推薦から進化まで）",
        "ko": "SkillsVote: Lifecycle Governance of Agent Skills from Collection, Recommendation to Evolution",
        "es": "SkillsVote: Gobernanza del ciclo de vida de habilidades de agentes desde la recopilación, recomendación hasta la evolución",
        "de": "SkillsVote: Lebenszyklus-Governance von Agentenfertigkeiten von der Sammlung, Empfehlung bis zur Evolution"
      },
      "summary_i18n": {
        "en": "SkillsVote is a governance framework for long-horizon LLM agents that manages reusable skills through structured collection, recommendation, and evolution processes.",
        "zh-CN": "提出SkillsVote框架，通过结构化收集、推荐和演化过程管理长周期LLM代理的可复用技能",
        "ja": "SkillsVoteは、長期的な視点を持つLLMエージェントのためのガバナンスフレームワークであり、構造的な収集、推薦、進化プロセスを通じて再利用可能なスキルを管理します。",
        "ko": "SkillsVote는 재사용 가능한 기술을 구조화된 수집, 추천 및 진화 과정을 통해 관리하는 장기적 LLM 에이전트의 거버넌스 프레임워크입니다.",
        "es": "SkillsVote es un marco de gobernanza para agentes LLM de largo plazo que gestiona habilidades reutilizables mediante procesos estructurados de recopilación, recomendación y evolución.",
        "de": "SkillsVote ist ein Governance-Framework für langfristige LLM-Agenten, das wiederverwendbare Fertigkeiten durch strukturierte Sammlung, Empfehlung und Evolution verwaltet."
      }
    },
    {
      "arxivId": "2605.12882",
      "title": "CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence",
      "summary": "CiteVQA introduces a benchmark for document vision-language models that evaluates both answer accuracy and correct citation of supporting evidence, revealing significant attribution hallucinations in current models.",
      "authors": [
        "Dongsheng Ma",
        "Jiayu Li",
        "Zhengren Wang",
        "Yijie Wang",
        "Jiahao Kong",
        "Weijun Zeng"
      ],
      "organization": {
        "_id": "66ce9d1f5e180b9b9c8e6f31",
        "name": "opendatalab",
        "fullname": "OpenDataLab",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/639c3afa7432f2f5d16b7296/yqxxBknyeqkGnYsjoaR4M.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 254,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.12882.png",
      "arxivUrl": "https://arxiv.org/abs/2605.12882",
      "pdfUrl": "https://arxiv.org/pdf/2605.12882.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.12882",
      "githubRepo": "https://github.com/opendatalab/CiteVQA",
      "githubStars": 58,
      "keywords": [
        "Multimodal Large Language Models",
        "Doc-VQA",
        "document understanding",
        "bounding-box citations",
        "Strict Attributed Accuracy",
        "Attribution Hallucination"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "CiteVQA：可信文档智能的证据归属基准",
      "summary_zh": "构建CiteVQA基准，评估文档视觉-语言模型的答案准确性和引用证据正确性，揭示当前模型存在显著的归属幻觉。",
      "title_i18n": {
        "en": "CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence",
        "zh-CN": "CiteVQA：可信文档智能的证据归属基准",
        "ja": "CiteVQA: 信頼性のあるドキュメントインテリジェンスのための証拠属性評価ベンチマーク",
        "ko": "CiteVQA: Benchmarking Evidence Attribution for Trustworthy Document Intelligence",
        "es": "CiteVQA: Benchmark para la atribución de evidencia en inteligencia documental confiable",
        "de": "CiteVQA: Benchmarking der Beweiszuordnung für vertrauenswürdige Dokumentenintelligenz"
      },
      "summary_i18n": {
        "en": "CiteVQA introduces a benchmark for document vision-language models that evaluates both answer accuracy and correct citation of supporting evidence, revealing significant attribution hallucinations in current models.",
        "zh-CN": "构建CiteVQA基准，评估文档视觉-语言模型的答案准确性和引用证据正确性，揭示当前模型存在显著的归属幻觉。",
        "ja": "CiteVQAは、文書ビジョン言語モデルのためのベンチマークを導入し、回答の正確さとサポート証拠の正しい引用を評価し、現在のモデルに重大な引用幻覚があることを明らかにしています。",
        "ko": "CiteVQA는 문서 비전-언어 모델에 대한 증거 인용 평가를 위한 벤치마크를 제안하며, 현재 모델의 인용 환상 문제를 드러냅니다.",
        "es": "CiteVQA introduce un benchmark para modelos de visión-lenguaje documental que evalúa tanto la precisión de las respuestas como la correcta citación de la evidencia, revelando importantes alucinaciones en los modelos actuales.",
        "de": "CiteVQA führt einen Benchmark für Dokumenten-Vision-Language-Modelle ein, der Antwortgenauigkeit und korrekte Zitierung von Beweisen bewertet und erhebliche Halluzinationen bei aktuellen Modellen aufdeckt."
      }
    },
    {
      "arxivId": "2605.13779",
      "title": "MinT: Managed Infrastructure for Training and Serving Millions of LLMs",
      "summary": "MinT is a managed infrastructure system that enables efficient low-rank adaptation training and serving by keeping base models resident and moving lightweight adapter revisions, scaling across multiple dimensions including large model architectures, reduced storage requirements, and distributed policy management.",
      "authors": [
        "Mind Lab",
        "Song Cao",
        "Vic Cao",
        "Andrew Chen",
        "Kaijie Chen",
        "Cleon Cheng"
      ],
      "organization": {
        "_id": "69d05e49e99e437a3b18bebc",
        "name": "mindlab-research",
        "fullname": "Mind Lab",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/64303a4484f3ed1ce62a2c5a/0L3sMuvL2JGls3zUKOKYU.jpeg"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 214,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13779.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13779",
      "pdfUrl": "https://arxiv.org/pdf/2605.13779.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13779",
      "githubRepo": "https://github.com/MindLab-Research/mindlab-toolkit",
      "githubStars": 35,
      "keywords": [
        "Low-Rank Adaptation",
        "LoRA",
        "post-training",
        "online serving",
        "base-model deployments",
        "full checkpoint"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "MinT：用于训练和部署数百万LLM的管理基础设施",
      "summary_zh": "MinT通过保留基础模型并移动轻量适配器，实现高效低秩适应训练与服务，支持大规模模型架构和分布式策略管理。",
      "title_i18n": {
        "en": "MinT: Managed Infrastructure for Training and Serving Millions of LLMs",
        "zh-CN": "MinT：用于训练和部署数百万LLM的管理基础设施",
        "ja": "MinT: 何百万ものLLMのトレーニングとサービスのためのマネージドインフラストラクチャ",
        "ko": "MinT: Managed Infrastructure for Training and Serving Millions of LLMs",
        "es": "MinT: Infraestructura gestionada para entrenar y servir millones de LLMs",
        "de": "MinT: Verwaltete Infrastruktur für das Training und Bereitstellen von Millionen von LLMs"
      },
      "summary_i18n": {
        "en": "MinT is a managed infrastructure system that enables efficient low-rank adaptation training and serving by keeping base models resident and moving lightweight adapter revisions, scaling across multiple dimensions including large model architectures, reduced storage requirements, and distributed policy management.",
        "zh-CN": "MinT通过保留基础模型并移动轻量适配器，实现高效低秩适应训练与服务，支持大规模模型架构和分布式策略管理。",
        "ja": "MinTは、ベースモデルを常駐させ、軽量アダプタの更新を移動することで、効率的な低ランク適応トレーニングとサービスを可能にするマネージドインフラストラクチャシステムです。",
        "ko": "MinT는 기본 모델을 유지하고 가벼운 어댑터 수정을 이동하여 효율적인 저단계 적응 학습 및 서비스를 가능하게 하는 관리형 인프라 시스템입니다.",
        "es": "MinT es un sistema de infraestructura gestionada que permite un entrenamiento y servicio eficientes de adaptación de bajo rango manteniendo modelos base residentes y moviendo revisiones ligeras de adaptadores, escalando en múltiples dimensiones.",
        "de": "MinT ist ein verwalteter Infrastruktursystem, das effizientes Low-Rank-Adaptierungstraining und -Bereitstellung ermöglicht, indem Basismodelle resident halten und leichte Adapter-Revisionen verschieben."
      }
    },
    {
      "arxivId": "2605.13301",
      "title": "Achieving Gold-Medal-Level Olympiad Reasoning via Simple and Unified Scaling",
      "summary": "A systematic approach transforms post-trained reasoning models into rigorous olympiad-level solvers through reverse-perplexity curriculum, two-stage reinforcement learning, and test-time scaling, achieving gold-medal performance on mathematical and physics competitions.",
      "authors": [
        "Yafu Li",
        "Runzhe Zhan",
        "Haoran Zhang",
        "Shunkai Zhang",
        "Yizhuo Li",
        "Zhilin Wang"
      ],
      "organization": null,
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 152,
      "comments": 4,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13301.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13301",
      "pdfUrl": "https://arxiv.org/pdf/2605.13301.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13301",
      "githubRepo": "https://github.com/Simplified-Reasoning/SU-01",
      "githubStars": 80,
      "keywords": [
        "reasoning models",
        "mathematical problem solving",
        "scientific problem solving",
        "International Mathematical Olympiad",
        "International Physics Olympiad",
        "reverse-perplexity curriculum"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "通过简单统一的扩展实现金牌级奥数推理",
      "summary_zh": "采用逆困惑度课程、两阶段强化学习和测试时扩展，将后训练推理模型转化为严谨的奥数解题器，达到金牌级别表现。",
      "title_i18n": {
        "en": "Achieving Gold-Medal-Level Olympiad Reasoning via Simple and Unified Scaling",
        "zh-CN": "通过简单统一的扩展实现金牌级奥数推理",
        "ja": "単純で統一的なスケーリングにより金メダルレベルのオリンピック問題解決を実現する",
        "ko": "Achieving Gold-Medal-Level Olympiad Reasoning via Simple and Unified Scaling",
        "es": "Lograr razonamiento de nivel medalla de oro en olimpiadas mediante escalado simple y unificado",
        "de": "Erreichen von Goldmedalien-Level-Olympiade-Reasoning durch einfaches und einheitliches Skalieren"
      },
      "summary_i18n": {
        "en": "A systematic approach transforms post-trained reasoning models into rigorous olympiad-level solvers through reverse-perplexity curriculum, two-stage reinforcement learning, and test-time scaling, achieving gold-medal performance on mathematical and physics competitions.",
        "zh-CN": "采用逆困惑度课程、两阶段强化学习和测试时扩展，将后训练推理模型转化为严谨的奥数解题器，达到金牌级别表现。",
        "ja": "逆パープレキシティカリキュラム、二段階強化学習、テスト時のスケーリングを通じて、後トレーニングされた推論モデルを厳密なオリンピックレベルのソルバーに変換する体系的なアプローチです。",
        "ko": "역전 퍼플렉서티 커리큘럼, 이단계 강화 학습 및 테스트 시간 스케일링을 통해 간단한 단일 확장을 통해 올림픽 수준의 문제 해결 능력을 달성합니다.",
        "es": "Un enfoque sistemático transforma modelos de razonamiento post-entrenados en resolutores rigurosos de olimpiadas mediante currículo de perplejidad inversa, aprendizaje por refuerzo en dos etapas y escalado en tiempo de prueba.",
        "de": "Ein systematischer Ansatz verwandelt nachtrainierte Reasoning-Modelle in rigorose Olympiade-Lösungen durch Reverse-Perplexity-Kurrikulum, zwei-stufiges Verstärkungslernen und Testzeit-Skalierung."
      }
    },
    {
      "arxivId": "2605.18678",
      "title": "Lance: Unified Multimodal Modeling by Multi-Task Synergy",
      "summary": "Lance is a unified multimodal model that combines understanding, generation, and editing capabilities for images and videos through collaborative multi-task training and a dual-stream architecture.",
      "authors": [
        "Fengyi Fu",
        "Mengqi Huang",
        "Shaojin Wu",
        "Yunsheng Jiang",
        "Yufei Huo",
        "Hao Li"
      ],
      "organization": {
        "_id": "66bc9cc55b64185087601c60",
        "name": "bytedance-research",
        "fullname": "bytedance-research",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6535c9e88bde2fae19b6fb25/7a1zq0juEwFJVCIShnLI-.png"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 62,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18678.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18678",
      "pdfUrl": "https://arxiv.org/pdf/2605.18678.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18678",
      "githubRepo": "https://github.com/bytedance/Lance",
      "githubStars": 314,
      "keywords": [
        "mixture-of-experts architecture",
        "multimodal understanding",
        "multimodal generation",
        "multimodal editing",
        "collaborative multi-task training",
        "unified context modeling"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Lance：通过多任务协同的统一多模态建模",
      "summary_zh": "构建Lance模型，通过多任务协同和双流架构实现图像与视频的理解、生成与编辑能力",
      "title_i18n": {
        "en": "Lance: Unified Multimodal Modeling by Multi-Task Synergy",
        "zh-CN": "Lance：通过多任务协同的统一多模态建模",
        "ja": "Lance: 多タスク協調による統合マルチモーダルモデリング",
        "ko": "Lance: Unified Multimodal Modeling by Multi-Task Synergy",
        "es": "Lance: Modelado multimodal unificado mediante síntesis multitarea",
        "de": "Lance: Einheitliche Multimodalmodellierung durch Multi-Aufgaben-Synergie"
      },
      "summary_i18n": {
        "en": "Lance is a unified multimodal model that combines understanding, generation, and editing capabilities for images and videos through collaborative multi-task training and a dual-stream architecture.",
        "zh-CN": "构建Lance模型，通过多任务协同和双流架构实现图像与视频的理解、生成与编辑能力",
        "ja": "Lanceは、コラボレーティブな多タスクトレーニングとデュアルストリームアーキテクチャを通じて、画像と動画の理解、生成、編集の能力を統合したマルチモーダルモデルです。",
        "ko": "Lance는 다중 작업 협업 학습과 이중 스트림 아키텍처를 통해 이미지 및 동영상의 이해, 생성 및 편집 기능을 결합한 통합 멀티모달 모델입니다.",
        "es": "Lance es un modelo multimodal unificado que combina capacidades de comprensión, generación y edición para imágenes y videos mediante entrenamiento colaborativo multitarea y arquitectura de doble flujo.",
        "de": "Lance ist ein einheitliches Multimodalmodell, das Verständnis, Generierung und Bearbeitungsfähigkeiten für Bilder und Videos durch kooperative Multi-Aufgaben-Training und eine Dual-Stream-Architektur kombiniert."
      }
    },
    {
      "arxivId": "2605.13724",
      "title": "AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation",
      "summary": "AnyFlow introduces a novel any-step video diffusion distillation framework that improves upon consistency distillation by optimizing full ODE sampling trajectories through flow-map transition learning and backward simulation techniques.",
      "authors": [
        "Yuchao Gu",
        "Guian Fang",
        "Yuxin Jiang",
        "Weijia Mao",
        "Song Han",
        "Han Cai"
      ],
      "organization": {
        "_id": "60262b67268c201cdc8b7d43",
        "name": "nvidia",
        "fullname": "NVIDIA",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65df9200dc3292a8983e5017/Vs5FPVCH-VZBipV3qKTuy.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 95,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13724.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13724",
      "pdfUrl": "https://arxiv.org/pdf/2605.13724.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13724",
      "githubRepo": "https://github.com/NVlabs/AnyFlow",
      "githubStars": 291,
      "keywords": [
        "consistency distillation",
        "video generation",
        "flow maps",
        "ODE sampling",
        "Euler rollout",
        "on-policy distillation"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "AnyFlow：基于策略流图蒸馏的任意步骤视频扩散模型",
      "summary_zh": "AnyFlow通过流图转移学习和反向模拟优化全ODE采样轨迹，提升一致性蒸馏效果。",
      "title_i18n": {
        "en": "AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation",
        "zh-CN": "AnyFlow：基于策略流图蒸馏的任意步骤视频扩散模型",
        "ja": "AnyFlow: オンポリシー・フローマップ蒸留による任意ステップ動画拡散モデル",
        "ko": "AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation",
        "es": "AnyFlow: Modelo de difusión de video de cualquier paso con distilación de mapa de flujo en política",
        "de": "AnyFlow: Any-Step-Video-Diffusionsmodell mit On-Policy-Flow-Map-Distillation"
      },
      "summary_i18n": {
        "en": "AnyFlow introduces a novel any-step video diffusion distillation framework that improves upon consistency distillation by optimizing full ODE sampling trajectories through flow-map transition learning and backward simulation techniques.",
        "zh-CN": "AnyFlow通过流图转移学习和反向模拟优化全ODE采样轨迹，提升一致性蒸馏效果。",
        "ja": "AnyFlowは、フローマップ遷移学習とバックワードシミュレーション技術を通じて、ODEサンプリングトラジェクトリを最適化することによって、一貫性蒸留を改善する新しい任意ステップ動画拡散蒸留フレームワークを導入します。",
        "ko": "AnyFlow는 흐름 지도 전이 학습 및 역방향 시뮬레이션 기법을 통해 전체 ODE 샘플링 경로를 최적화하는 새로운 any-step 동영상 확산 흡수 프레임워크를 소개합니다.",
        "es": "AnyFlow introduce un marco de distilación de difusión de video de cualquier paso que mejora la consistencia mediante optimización de trayectorias completas de muestreo ODE a través de aprendizaje de transición de mapa de flujo y simulación hacia atrás.",
        "de": "AnyFlow führt einen neuen Any-Step-Video-Diffusionsdistillationsrahmen ein, der durch Optimierung vollständiger ODE-Sampling-Wege durch Flow-Map-Übergangslernen und Rückwärts-Simulationstechniken verbessert wird."
      }
    },
    {
      "arxivId": "2605.13527",
      "title": "MMSkills: Towards Multimodal Skills for General Visual Agents",
      "summary": "Multimodal procedural knowledge frameworks enable visual agents to leverage external reusable skills through structured representations combining text, state cards, and visual keyframes, improving decision-making in complex environments.",
      "authors": [
        "Kangning Zhang",
        "Shuai Shao",
        "Qingyao Li",
        "Jianghao Lin",
        "Lingyue Fu",
        "Shijian Wang"
      ],
      "organization": {
        "_id": "63ec8ce89d77b7eb70568340",
        "name": "ShanghaiJiaotongUniversity",
        "fullname": "Shanghai Jiaotong University 1(NOT OFFICIAL)",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/63ec8c599d77b7eb70567d94/aD8jb0IbftwEH_V1kffGG.jpeg"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 111,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13527.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13527",
      "pdfUrl": "https://arxiv.org/pdf/2605.13527.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13527",
      "githubRepo": "https://github.com/DeepExperience/MMSkills",
      "githubStars": 125,
      "keywords": [
        "multimodal procedural knowledge",
        "visual agents",
        "skill packages",
        "state-conditioned packages",
        "visual grounding",
        "agentic trajectory-to-skill Generator"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "MMSkills：面向通用视觉代理的多模态技能",
      "summary_zh": "构建多模态程序知识框架，通过文本、状态卡和视觉关键帧提升视觉代理决策能力",
      "title_i18n": {
        "en": "MMSkills: Towards Multimodal Skills for General Visual Agents",
        "zh-CN": "MMSkills：面向通用视觉代理的多模态技能",
        "ja": "MMSkills: Towards Multimodal Skills for General Visual Agents",
        "ko": "MMSkills: Towards Multimodal Skills for General Visual Agents",
        "es": "MMSkills: Towards Multimodal Skills for General Visual Agents",
        "de": "MMSkills: Towards Multimodal Skills for General Visual Agents"
      },
      "summary_i18n": {
        "en": "Multimodal procedural knowledge frameworks enable visual agents to leverage external reusable skills through structured representations combining text, state cards, and visual keyframes, improving decision-making in complex environments.",
        "zh-CN": "构建多模态程序知识框架，通过文本、状态卡和视觉关键帧提升视觉代理决策能力",
        "ja": "マルチモーダルプロシジャル知識フレームワークは、視覚エージェントが構造化された表現を通じて外部の再利用可能なスキルを活用できるようにし、複雑な環境での意思決定を向上させる。",
        "ko": "다중모달 절차 지식 프레임워크는 시각 에이전트가 구조화된 표현을 통해 외부 재사용 가능한 기술을 활용하게 하여 복잡한 환경에서 의사결정을 개선합니다.",
        "es": "Los marcos de conocimiento procedural multimodal permiten a los agentes visuales aprovechar habilidades reutilizables externas mediante representaciones estructuradas que combinan texto, tarjetas de estado y fotogramas visuales, mejorando la toma de decisiones en entornos complejos.",
        "de": "Multimodale Prozedurwissen-Rahmenwerke ermöglichen es visuellen Agenten, externe wiederverwendbare Fähigkeiten durch strukturierte Darstellungen zu nutzen, die Text, Zustandskarten und visuelle Schlüsselbilder kombinieren."
      }
    },
    {
      "arxivId": "2605.15155",
      "title": "Self-Distilled Agentic Reinforcement Learning",
      "summary": "SDAR enhances reinforcement learning for multi-turn agent training by integrating self-distillation through a sigmoid gate that selectively strengthens positive token-level guidance while mitigating negative teacher rejections.",
      "authors": [
        "Zhengxi Lu",
        "Zhiyuan Yao",
        "Zhuowen Han",
        "Zi-Han Wang",
        "Jinyang Wu",
        "Qi Gu"
      ],
      "organization": null,
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 100,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15155.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15155",
      "pdfUrl": "https://arxiv.org/pdf/2605.15155.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15155",
      "githubRepo": "https://github.com/ZJU-REAL/SDAR",
      "githubStars": 104,
      "keywords": [
        "Reinforcement learning",
        "on-policy self-distillation",
        "token-level guidance",
        "teacher branch",
        "privileged context",
        "multi-turn agents"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "自蒸馏代理强化学习",
      "summary_zh": "通过Sigmoid门控实现自蒸馏，提升多轮代理训练的强化学习效果",
      "title_i18n": {
        "en": "Self-Distilled Agentic Reinforcement Learning",
        "zh-CN": "自蒸馏代理强化学习",
        "ja": "Self-Distilled Agentic Reinforcement Learning",
        "ko": "Self-Distilled Agentic Reinforcement Learning",
        "es": "Self-Distilled Agentic Reinforcement Learning",
        "de": "Self-Distilled Agentic Reinforcement Learning"
      },
      "summary_i18n": {
        "en": "SDAR enhances reinforcement learning for multi-turn agent training by integrating self-distillation through a sigmoid gate that selectively strengthens positive token-level guidance while mitigating negative teacher rejections.",
        "zh-CN": "通过Sigmoid门控实现自蒸馏，提升多轮代理训练的强化学习效果",
        "ja": "SDARは、シグモイドゲートを通じた自己蒸留を統合して、多ターンエージェントトレーニングの強化学習を向上させ、ポジティブなトークンレベルの指導を強化し、ネガティブな教師の拒否を軽減する。",
        "ko": "SDAR은 시그모이드 게이트를 통한 자기 교육을 통합하여 다단계 에이전트 훈련의 강화 학습을 향상시킵니다.",
        "es": "SDAR mejora el aprendizaje por refuerzo para entrenamiento de agentes de múltiples turnos integrando auto-distilación mediante una compuerta sigmoide que fortalece selectivamente la guía a nivel de token positivo y mitiga las rechazos del profesor negativo.",
        "de": "SDAR verbessert das Verstärkungslernen für mehrschrittige Agententraining durch Selbst-Distillation mit einem Sigmoid-Gatter, das positive Token-Ebene-Anleitungen verstärkt."
      }
    },
    {
      "arxivId": "2605.15298",
      "title": "PhysBrain 1.0 Technical Report",
      "summary": "PhysBrain 1.0 leverages human egocentric video to generate physical commonsense supervision for vision-language-action models, achieving state-of-the-art performance in embodied control tasks through capability-preserving adaptation.",
      "authors": [
        "Shijie Lian",
        "Bin Yu",
        "Xiaopeng Lin",
        "Changti Wu",
        "Hang Yuan",
        "Xiaolin Hu"
      ],
      "organization": {
        "_id": "6948d884070dda0c2ae35a78",
        "name": "DeepCybo",
        "fullname": "DeepCybo",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65ec01fd770aa0e25d9374dc/QOsz6P_7AxyqGrjsRHTGk.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 136,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15298.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15298",
      "pdfUrl": "https://arxiv.org/pdf/2605.15298.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15298",
      "githubRepo": "https://github.com/Phys-Brain/PhysBrain-VLA",
      "githubStars": 18,
      "keywords": [
        "vision-language-action models",
        "physical commonsense supervision",
        "multimodal QA benchmarks",
        "embodied control benchmarks",
        "VLA policies",
        "capability-preserving adaptation"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "PhysBrain 1.0 技术报告",
      "summary_zh": "利用人类第一视角视频生成物理常识监督，提升视觉-语言-动作模型在具身控制任务中的性能。",
      "title_i18n": {
        "en": "PhysBrain 1.0 Technical Report",
        "zh-CN": "PhysBrain 1.0 技术报告",
        "ja": "PhysBrain 1.0 Technical Report",
        "ko": "PhysBrain 1.0 Technical Report",
        "es": "Informe Técnico de PhysBrain 1.0",
        "de": "PhysBrain 1.0 Technical Report"
      },
      "summary_i18n": {
        "en": "PhysBrain 1.0 leverages human egocentric video to generate physical commonsense supervision for vision-language-action models, achieving state-of-the-art performance in embodied control tasks through capability-preserving adaptation.",
        "zh-CN": "利用人类第一视角视频生成物理常识监督，提升视觉-语言-动作模型在具身控制任务中的性能。",
        "ja": "PhysBrain 1.0は人間のエゴセントリックビデオを活用し、ビジョン-言語-アクションモデルに物理的な共通認識の監督を生成し、能力保持型の適応により、実体化制御タスクで最優秀のパフォーマンスを達成する。",
        "ko": "PhysBrain 1.0은 인간 중심 영상을 사용하여 물리적 공통지식 감독을 생성하고, 몸에 맞는 적응을 통해 최신 성능을 달성합니다.",
        "es": "PhysBrain 1.0 utiliza videos egocéntricos humanos para generar supervisión de sentido físico común para modelos de lenguaje-visión-acción, logrando un rendimiento de vanguardia en tareas de control encarnado mediante adaptación preservadora de capacidades.",
        "de": "PhysBrain 1.0 nutzt menschliche perspektivische Videos, um physisches Alltagswissen für Vision-Language-Action-Modelle zu generieren, und erreicht führende Leistungen in embodied control Tasks."
      }
    },
    {
      "arxivId": "2605.18661",
      "title": "AI for Auto-Research: Roadmap & User Guide",
      "summary": "AI systems demonstrate varying reliability across research stages, excelling in structured tasks but struggling with novel ideas and scientific judgment, necessitating human oversight for credible outcomes.",
      "authors": [
        "Lingdong Kong",
        "Xian Sun",
        "Wei Chow",
        "Linfeng Li",
        "Kevin Qinghong Lin",
        "Xuan Billy Zhang"
      ],
      "organization": null,
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 57,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18661.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18661",
      "pdfUrl": "https://arxiv.org/pdf/2605.18661.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18661",
      "githubRepo": "https://github.com/worldbench/awesome-ai-auto-research",
      "githubStars": 61,
      "keywords": [
        "AI-assisted research",
        "automated systems",
        "research papers",
        "long-horizon agents",
        "scientific integrity",
        "epistemological phases"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "AI用于自动研究：路线图与用户指南",
      "summary_zh": "利用GPT和LoRA提升研究可靠性，优化结构化任务并增强科学判断能力。",
      "title_i18n": {
        "en": "AI for Auto-Research: Roadmap & User Guide",
        "zh-CN": "AI用于自动研究：路线图与用户指南",
        "ja": "AI for Auto-Research: Roadmap & User Guide",
        "ko": "AI for Auto-Research: Roadmap & User Guide",
        "es": "IA para Auto-Investigación: Mapa de ruta y Guía del Usuario",
        "de": "AI for Auto-Research: Roadmap & User Guide"
      },
      "summary_i18n": {
        "en": "AI systems demonstrate varying reliability across research stages, excelling in structured tasks but struggling with novel ideas and scientific judgment, necessitating human oversight for credible outcomes.",
        "zh-CN": "利用GPT和LoRA提升研究可靠性，优化结构化任务并增强科学判断能力。",
        "ja": "AIシステムは研究段階において信頼性が異なる。構造化されたタスクでは優れているが、新しいアイデアや科学的判断には苦手で、信頼できる結果を得るために人間の監視が必要である。",
        "ko": "AI 시스템은 연구 단계에서 신뢰도가 다양하며, 구조화된 작업에서는 우수하지만 새로운 아이디어와 과학적 판단에서는 어려움을 겪습니다.",
        "es": "Los sistemas de IA muestran confiabilidad variable en etapas de investigación, destacando en tareas estructuradas pero teniendo dificultades con ideas novedosas y juicio científico, requiriendo supervisión humana para resultados creíbles.",
        "de": "KI-Systeme zeigen unterschiedliche Zuverlässigkeit über Forschungsphasen hinweg, sind in strukturierten Aufgaben gut, aber bei neuen Ideen und wissenschaftlicher Urteilsbildung eingeschränkt."
      }
    },
    {
      "arxivId": "2605.13841",
      "title": "EVA-Bench: A New End-to-end Framework for Evaluating Voice Agents",
      "summary": "EVA-Bench presents a comprehensive evaluation framework for voice agents that simulates realistic conversations and measures performance across multiple voice-specific failure modes using novel accuracy and experience metrics.",
      "authors": [
        "Tara Bogavelli",
        "Gabrielle Gauthier Melançon",
        "Katrina Stankiewicz",
        "Oluwanifemi Bamgbose",
        "Fanny Riols",
        "Hoang H. Nguyen"
      ],
      "organization": {
        "_id": "65f4df5de83b55da5d79fbb6",
        "name": "ServiceNow-AI",
        "fullname": "ServiceNow-AI",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/63d3095c2727d7888cbb54e2/Uv-Lx8PVGviqokfOyYlCN.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 61,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13841.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13841",
      "pdfUrl": "https://arxiv.org/pdf/2605.13841.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13841",
      "githubRepo": "https://github.com/ServiceNow/eva",
      "githubStars": 123,
      "keywords": [
        "voice agents",
        "bot-to-bot audio conversations",
        "multi-turn dialogues",
        "user simulator",
        "automatic simulation validation",
        "composite metrics"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "EVA-Bench：一种新的端到端语音助手评估框架",
      "summary_zh": "提出EVA-Bench框架，通过模拟真实对话评估语音助手性能，使用新指标衡量准确性和体验",
      "title_i18n": {
        "en": "EVA-Bench: A New End-to-end Framework for Evaluating Voice Agents",
        "zh-CN": "EVA-Bench：一种新的端到端语音助手评估框架",
        "ja": "EVA-Bench: A New End-to-end Framework for Evaluating Voice Agents",
        "ko": "EVA-Bench: A New End-to-end Framework for Evaluating Voice Agents",
        "es": "EVA-Bench: Un Nuevo Marco End-to-End para Evaluar Agentes de Voz",
        "de": "EVA-Bench: A New End-to-end Framework for Evaluating Voice Agents"
      },
      "summary_i18n": {
        "en": "EVA-Bench presents a comprehensive evaluation framework for voice agents that simulates realistic conversations and measures performance across multiple voice-specific failure modes using novel accuracy and experience metrics.",
        "zh-CN": "提出EVA-Bench框架，通过模拟真实对话评估语音助手性能，使用新指标衡量准确性和体验",
        "ja": "EVA-Benchは、音声エージェントのための包括的な評価フレームワークを提示し、現実的な会話をシミュレートし、音声固有の失敗モードを測定する新しい正確性と経験メトリクスを使用する。",
        "ko": "EVA-Bench는 실제 대화를 시뮬레이션하고 음성 특수 실패 모드를 측정하는 종합 평가 프레임워크를 제시합니다.",
        "es": "EVA-Bench presenta un marco de evaluación integral para agentes de voz que simula conversaciones realistas y mide el desempeño en múltiples modos de falla específicos de voz usando métricas de precisión y experiencia innovadoras.",
        "de": "EVA-Bench präsentiert ein umfassendes Bewertungsrahmenwerk für Sprachagenten, das realistische Gespräche simuliert und Leistung anhand von neuartigen Genauigkeits- und Erfahrungsmetriken misst."
      }
    },
    {
      "arxivId": "2605.10912",
      "title": "WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation",
      "summary": "WildClawBench evaluates language and vision-language models on realistic long-horizon tasks using actual CLI environments with real tools instead of synthetic sandboxes.",
      "authors": [
        "Shuangrui Ding",
        "Xuanlang Dai",
        "Long Xing",
        "Shengyuan Ding",
        "Ziyu Liu",
        "Yang JingYi"
      ],
      "organization": {
        "_id": "64a2d5fa81252883206f24c9",
        "name": "internlm",
        "fullname": "Intern Large Models",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6432683407bad11484a68457/Q3Y0dL79GcsnaBCGRMooZ.png"
      },
      "publishedAt": "2026-05-11T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 45,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.10912.png",
      "arxivUrl": "https://arxiv.org/abs/2605.10912",
      "pdfUrl": "https://arxiv.org/pdf/2605.10912.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.10912",
      "githubRepo": "https://github.com/internlm/WildClawBench",
      "githubStars": 388,
      "keywords": [
        "command-line interface",
        "multimodal tasks",
        "Docker container",
        "LLM/VLM judge",
        "semantic verification",
        "tool calls"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "WildClawBench：真实长周期代理评估基准",
      "summary_zh": "构建WildClawBench评估语言与视觉语言模型在真实CLI环境中的长周期任务表现",
      "title_i18n": {
        "en": "WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation",
        "zh-CN": "WildClawBench：真实长周期代理评估基准",
        "ja": "WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation",
        "ko": "WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation",
        "es": "WildClawBench: Un Benchmark para la Evaluación de Agentes de Alto Horizonte en el Mundo Real",
        "de": "WildClawBench: A Benchmark for Real-World, Long-Horizon Agent Evaluation"
      },
      "summary_i18n": {
        "en": "WildClawBench evaluates language and vision-language models on realistic long-horizon tasks using actual CLI environments with real tools instead of synthetic sandboxes.",
        "zh-CN": "构建WildClawBench评估语言与视觉语言模型在真实CLI环境中的长周期任务表现",
        "ja": "WildClawBenchは、実際のCLI環境と実際のツールを使用して、現実的な長期的なタスクにおける言語およびビジョン-言語モデルを評価するベンチマークである。",
        "ko": "WildClawBench는 실제 CLI 환경과 도구를 사용하여 현실적인 장기 목표 에이전트 평가를 수행합니다.",
        "es": "WildClawBench evalúa modelos de lenguaje y lenguaje-vision en tareas de alto horizonte real utilizando entornos de línea de comandos reales con herramientas reales en lugar de sandbox sintéticos.",
        "de": "WildClawBench bewertet Sprach- und Vision-Sprach-Modelle auf realistischen langfristigen Aufgaben mit echten CLI-Umgebungen und echten Tools statt synthetischen Sandkästen."
      }
    },
    {
      "arxivId": "2605.15824",
      "title": "FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization",
      "summary": "FashionChameleon enables real-time interactive multi-garment video customization through teacher-student distillation and in-context learning techniques while maintaining motion coherence.",
      "authors": [
        "Quanjian Song",
        "Yefeng Shen",
        "Mengting Chen",
        "Hao Sun",
        "Jinsong Lan",
        "Xiaoyong Zhu"
      ],
      "organization": {
        "_id": "64488b334988ee01f2a8d856",
        "name": "alibaba-inc",
        "fullname": "alibaba-inc",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/61ac8f8a00d01045fca0ad2f/MX4wxQVaFm1A1wqnrL2WU.jpeg"
      },
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 54,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15824.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15824",
      "pdfUrl": "https://arxiv.org/pdf/2605.15824.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15824",
      "githubRepo": "https://github.com/quanjiansong/FashionChameleon",
      "githubStars": 79,
      "keywords": [
        "autoregressive video generation",
        "in-context learning",
        "streaming distillation",
        "gradient-reweighted distribution matching",
        "kv cache rescheduling",
        "garment switching"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "FashionChameleon：实时交互式人体服装视频定制",
      "summary_zh": "通过教师-学生蒸馏和上下文学习实现多服装实时交互视频定制，保持动作连贯性",
      "title_i18n": {
        "en": "FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization",
        "zh-CN": "FashionChameleon：实时交互式人体服装视频定制",
        "ja": "FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization",
        "ko": "FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization",
        "es": "FashionChameleon: Hacia la Personalización en Tiempo Real y Interactiva de Videoconferencias con Ropa",
        "de": "FashionChameleon: Towards Real-Time and Interactive Human-Garment Video Customization"
      },
      "summary_i18n": {
        "en": "FashionChameleon enables real-time interactive multi-garment video customization through teacher-student distillation and in-context learning techniques while maintaining motion coherence.",
        "zh-CN": "通过教师-学生蒸馏和上下文学习实现多服装实时交互视频定制，保持动作连贯性",
        "ja": "FashionChameleonは、教師-生徒の蒸留と文脈内学習技術を通じて、リアルタイムでインタラクティブなマルチガーメント動画カスタマイズを可能にし、動きの整合性を維持する。",
        "ko": "FashionChameleon은 움직임 일관성을 유지하면서 교사-학생 소프트웨어 및 맥락 내 학습 기법을 통해 실시간 인터랙티브 다중 의류 비디오 커스터마이징을 가능하게 합니다.",
        "es": "FashionChameleon permite personalización interactiva en tiempo real de múltiples prendas mediante técnicas de distilación maestro-estudiante y aprendizaje en contexto mientras se mantiene la coherencia del movimiento.",
        "de": "FashionChameleon ermöglicht Echtzeit-Interaktivität bei der Multi-Garment-Videobearbeitung durch Lehrer-Student-Distillation und Kontextlernverfahren."
      }
    },
    {
      "arxivId": "2605.13565",
      "title": "Qwen-Image-VAE-2.0 Technical Report",
      "summary": "Qwen-Image-VAE-2.0 is a high-compression Variational Autoencoder suite that improves reconstruction fidelity and diffusability through enhanced architecture, large-scale training, and semantic alignment strategies.",
      "authors": [
        "Zekai Zhang",
        "Deqing Li",
        "Kuan Cao",
        "Yujia Wu",
        "Chenfei Wu",
        "Yu Wu"
      ],
      "organization": {
        "_id": "64c8b5837fe12ecd0a7e92eb",
        "name": "Qwen",
        "fullname": "Qwen",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 57,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13565.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13565",
      "pdfUrl": "https://arxiv.org/pdf/2605.13565.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13565",
      "githubRepo": "https://github.com/alibaba/OmniDoc-TokenBench",
      "githubStars": 53,
      "keywords": [
        "Variational Autoencoders",
        "Global Skip Connections",
        "latent channels",
        "synthetic rendering engine",
        "semantic alignment",
        "asymmetric encoder-decoder"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Qwen-Image-VAE-2.0 技术报告",
      "summary_zh": "改进架构与语义对齐，提升重建精度与扩散性能",
      "title_i18n": {
        "en": "Qwen-Image-VAE-2.0 Technical Report",
        "zh-CN": "Qwen-Image-VAE-2.0 技术报告",
        "ja": "Qwen-Image-VAE-2.0 Technical Report",
        "ko": "Qwen-Image-VAE-2.0 Technical Report",
        "es": "Informe Técnico de Qwen-Image-VAE-2.0",
        "de": "Qwen-Image-VAE-2.0 Technical Report"
      },
      "summary_i18n": {
        "en": "Qwen-Image-VAE-2.0 is a high-compression Variational Autoencoder suite that improves reconstruction fidelity and diffusability through enhanced architecture, large-scale training, and semantic alignment strategies.",
        "zh-CN": "改进架构与语义对齐，提升重建精度与扩散性能",
        "ja": "Qwen-Image-VAE-2.0は、拡張されたアーキテクチャ、大規模なトレーニング、意味の整合戦略を通じて、再構築の正確性と拡散性を向上させる高圧縮変分オートエンコーダーのサセットである。",
        "ko": "Qwen-Image-VAE-2.0는 고압축 변분 자동인코더 세트로, 강화된 아키텍처와 대규모 학습을 통해 재구성 정확도와 확산성을 개선합니다.",
        "es": "Qwen-Image-VAE-2.0 es un conjunto de Autoencoders Variacionales de alta compresión que mejora la fidelidad de reconstrucción y difusibilidad mediante arquitectura mejorada, entrenamiento a gran escala y estrategias de alineación semántica.",
        "de": "Qwen-Image-VAE-2.0 ist eine Hochkompression-Variational-Autoencoder-Suite, die durch erweiterte Architektur, großflächiges Training und semantische Ausrichtung die Rekonstruktionsqualität verbessert."
      }
    },
    {
      "arxivId": "2605.18451",
      "title": "Code-as-Room: Generating 3D Rooms from Top-Down View Images via Agentic Code Synthesis",
      "summary": "A novel MLLM-based agentic framework called Code-as-Room generates 3D indoor rooms by converting top-down images into executable Blender code through a structured execution harness with cross-stage memory to maintain context.",
      "authors": [
        "Yixuan Yang",
        "Zhen Luo",
        "Wanshui Gan",
        "Jinkun Hao",
        "Junru Lu",
        "Jinghao Yan"
      ],
      "organization": {
        "_id": "6747ee5decec679eafb90450",
        "name": "ShanghaiAiLab",
        "fullname": "shanghai ailab ",
        "avatar": "https://www.gravatar.com/avatar/6cd2acf412ad103653d9ce14a1aacc19?d=retro&size=100"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 37,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18451.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18451",
      "pdfUrl": "https://arxiv.org/pdf/2605.18451.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18451",
      "githubRepo": "https://github.com/YxuanAr/Code-as-Room",
      "githubStars": 32,
      "keywords": [
        "MLLM-based agentic framework",
        "structured execution harness",
        "Blender codes",
        "cross-stage memory",
        "3D room synthesis",
        "top-down views"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Code-as-Room：通过代理代码生成3D房间",
      "summary_zh": "基于MLLM的代理框架Code-as-Room将俯视图转换为可执行Blender代码生成3D室内房间。",
      "title_i18n": {
        "en": "Code-as-Room: Generating 3D Rooms from Top-Down View Images via Agentic Code Synthesis",
        "zh-CN": "Code-as-Room：通过代理代码生成3D房间",
        "ja": "Code-as-Room: Top-Down View Images から 3D ルームを生成するエージェントコード合成を通じた",
        "ko": "Code-as-Room: Generating 3D Rooms from Top-Down View Images via Agentic Code Synthesis",
        "es": "Code-as-Room: Generando habitaciones 3D a partir de imágenes de vista superior mediante síntesis de código agente",
        "de": "Code-as-Room: Generieren von 3D-Räumen aus Top-Down-Bildern durch agen-tische Code-Synthese"
      },
      "summary_i18n": {
        "en": "A novel MLLM-based agentic framework called Code-as-Room generates 3D indoor rooms by converting top-down images into executable Blender code through a structured execution harness with cross-stage memory to maintain context.",
        "zh-CN": "基于MLLM的代理框架Code-as-Room将俯视图转换为可执行Blender代码生成3D室内房间。",
        "ja": "Code-as-Room は、コンテキストを維持するためのクロスステージメモリを備えた構造化実行ハーネスを通じて、トップダウン画像を実行可能な Blender コードに変換して 3D 室を生成する新しい MLLM ベースのエージェントフレームワークです。",
        "ko": "Code-as-Room은 상단 시야 이미지를 실행 가능한 Blender 코드로 변환하여 3D 실내 공간을 생성하는 새로운 MLLM 기반 에이전트 프레임워크입니다.",
        "es": "Un marco basado en MLLM llamado Code-as-Room genera habitaciones 3D convirtiendo imágenes de vista superior en código de Blender ejecutable mediante un sistema estructurado con memoria cruzada.",
        "de": "Ein neues MLLM-basiertes agenzisches Framework namens Code-as-Room generiert 3D-Innenräume, indem es Top-Down-Bilder in ausführbaren Blender-Code umwandelt."
      }
    },
    {
      "arxivId": "2605.14906",
      "title": "MemLens: Benchmarking Multimodal Long-Term Memory in Large Vision-Language Models",
      "summary": "A new benchmark evaluates memory capabilities in vision-language models through multi-session conversations, revealing limitations of both long-context and memory-augmented approaches.",
      "authors": [
        "Xiyu Ren",
        "Zhaowei Wang",
        "Yiming Du",
        "Zhongwei Xie",
        "Chi Liu",
        "Xinlin Yang"
      ],
      "organization": {
        "_id": "60262b67268c201cdc8b7d43",
        "name": "nvidia",
        "fullname": "NVIDIA",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65df9200dc3292a8983e5017/Vs5FPVCH-VZBipV3qKTuy.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 71,
      "comments": 5,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14906.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14906",
      "pdfUrl": "https://arxiv.org/pdf/2605.14906.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14906",
      "githubRepo": "https://github.com/xrenaf/MEMLENS",
      "githubStars": 17,
      "keywords": [
        "vision-language models",
        "long-context LVLMs",
        "memory-augmented agents",
        "multimodal multi-session conversations",
        "memory abilities",
        "cross-modal token-counting"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "MemLens：大型多模态视觉语言模型的长期记忆基准",
      "summary_zh": "构建多轮对话基准评估视觉语言模型的记忆能力，揭示长上下文与记忆增强方法的局限性。",
      "title_i18n": {
        "en": "MemLens: Benchmarking Multimodal Long-Term Memory in Large Vision-Language Models",
        "zh-CN": "MemLens：大型多模态视觉语言模型的长期记忆基准",
        "ja": "MemLens: 大規模なビジョン・言語モデルにおけるマルチモーダル長期記憶のベンチマーキング",
        "ko": "MemLens: Benchmarking Multimodal Long-Term Memory in Large Vision-Language Models",
        "es": "MemLens: Benchmarking Multimodal Long-Term Memory in Large Vision-Language Models",
        "de": "MemLens: Benchmarking multimodaler Langzeitgedächtnisse in großen Vision-Language-Modellen"
      },
      "summary_i18n": {
        "en": "A new benchmark evaluates memory capabilities in vision-language models through multi-session conversations, revealing limitations of both long-context and memory-augmented approaches.",
        "zh-CN": "构建多轮对话基准评估视觉语言模型的记忆能力，揭示长上下文与记忆增强方法的局限性。",
        "ja": "マルチセッション会話を通じてビジュアル言語モデルの記憶能力を評価する新しいベンチマーキングが、長文脈および記憶拡張アプローチの限界を明らかにしました。",
        "ko": "MemLens은 다중 세션 대화를 통해 비전-언어 모델의 기억 능력을 평가하며, 장기적 맥락 및 메모리 증강 접근법의 한계를 드러냅니다.",
        "es": "Un nuevo benchmark evalúa las capacidades de memoria en modelos vision-lenguaje mediante conversaciones multisesión, revelando limitaciones de enfoques de largo contexto y con memoria.",
        "de": "Ein neuer Benchmark bewertet Gedächtnisfähigkeiten in Vision-Language-Modellen über mehrere Sitzungen hinweg, wobei Grenzen bei langfristigen Kontext- und Gedächtnisverstärkungsansätzen aufgedeckt werden."
      }
    },
    {
      "arxivId": "2605.15128",
      "title": "MemEye: A Visual-Centric Evaluation Framework for Multimodal Agent Memory",
      "summary": "MemEye framework evaluates multimodal agent memory by measuring visual evidence granularity and retrieval usage complexity across 8 life-scenario tasks.",
      "authors": [
        "Minghao Guo",
        "Qingyue Jiao",
        "Zeru Shi",
        "Yihao Quan",
        "Boxuan Zhang",
        "Danrui Li"
      ],
      "organization": null,
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 60,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15128.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15128",
      "pdfUrl": "https://arxiv.org/pdf/2605.15128.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15128",
      "githubRepo": "https://github.com/MinghoKwok/MemEye",
      "githubStars": 30,
      "keywords": [
        "multimodal memory",
        "visual evidence",
        "memory evaluation",
        "VLM backbones",
        "evidence routing",
        "temporal tracking"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "MemEye：多模态代理记忆的视觉评估框架",
      "summary_zh": "MemEye通过测量视觉证据粒度和检索复杂性评估多模态代理记忆",
      "title_i18n": {
        "en": "MemEye: A Visual-Centric Evaluation Framework for Multimodal Agent Memory",
        "zh-CN": "MemEye：多模态代理记忆的视觉评估框架",
        "ja": "MemEye: マルチモーダルエージェント記憶の視覚中心評価フレームワーク",
        "ko": "MemEye: A Visual-Centric Evaluation Framework for Multimodal Agent Memory",
        "es": "MemEye: Un marco de evaluación centrado en la visión para la memoria de agentes multimodales",
        "de": "MemEye: Ein visuell zentriertes Bewertungsrahmenwerk für multimodale Agentengedächtnisse"
      },
      "summary_i18n": {
        "en": "MemEye framework evaluates multimodal agent memory by measuring visual evidence granularity and retrieval usage complexity across 8 life-scenario tasks.",
        "zh-CN": "MemEye通过测量视觉证据粒度和检索复杂性评估多模态代理记忆",
        "ja": "MemEye フレームワークは、8つのライフシナリオタスクで視覚的証拠の粒度と検索使用の複雑さを測定することで、マルチモーダルエージェント記憶を評価します。",
        "ko": "MemEye는 8개의 생활 시나리오 작업에서 시각적 증거의 세부성과 검색 사용 복잡성을 측정하여 다중모달 에이전트 메모리를 평가합니다.",
        "es": "El marco MemEye evalúa la memoria de agentes multimodales midiendo la granularidad de evidencia visual y la complejidad de uso de recuperación en 8 tareas de escenarios de vida.",
        "de": "Das MemEye-Rahmenwerk bewertet multimodales Agentengedächtnis anhand der Granularität visueller Beweise und der Komplexität der Abfrageverwendung."
      }
    },
    {
      "arxivId": "2605.10616",
      "title": "MulTaBench: Benchmarking Multimodal Tabular Learning with Text and Image",
      "summary": "Multimodal tabular learning benchmarks reveal that task-specific embedding tuning improves performance over frozen pretrained embeddings, particularly when modalities provide complementary predictive signals.",
      "authors": [
        "Alan Arazi",
        "Eilam Shapira",
        "Shoham Grunblat",
        "Mor Ventura",
        "Elad Hoffer",
        "Gioia Blayer"
      ],
      "organization": {
        "_id": "6393322be2364bc1eea56e45",
        "name": "Technion",
        "fullname": "Technion Israel institute of technology",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/1670591001944-63926124526c29d5b5011374.jpeg"
      },
      "publishedAt": "2026-05-11T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 137,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.10616.png",
      "arxivUrl": "https://arxiv.org/abs/2605.10616",
      "pdfUrl": "https://arxiv.org/pdf/2605.10616.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.10616",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "tabular foundation models",
        "multimodal tabular learning",
        "pretrained embeddings",
        "target-aware representations",
        "MulTaBench",
        "predictive tasks"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "MulTaBench：基于文本和图像的多模态表格学习基准",
      "summary_zh": "通过任务特定嵌入微调提升多模态表格学习性能，尤其在模态提供互补预测信号时效果显著。",
      "title_i18n": {
        "en": "MulTaBench: Benchmarking Multimodal Tabular Learning with Text and Image",
        "zh-CN": "MulTaBench：基于文本和图像的多模态表格学习基准",
        "ja": "MulTaBench: テキストと画像を用いたマルチモーダルテーブル学習のベンチマーキング",
        "ko": "MulTaBench: Benchmarking Multimodal Tabular Learning with Text and Image",
        "es": "MulTaBench: Benchmarking Multimodal Tabular Learning with Text and Image",
        "de": "MulTaBench: Benchmarking multimodaler Tabellenlernens mit Text und Bild"
      },
      "summary_i18n": {
        "en": "Multimodal tabular learning benchmarks reveal that task-specific embedding tuning improves performance over frozen pretrained embeddings, particularly when modalities provide complementary predictive signals.",
        "zh-CN": "通过任务特定嵌入微调提升多模态表格学习性能，尤其在模态提供互补预测信号时效果显著。",
        "ja": "マルチモーダルテーブル学習のベンチマーキングにより、タスク固有の埋め込みチューニングが、特にモダリティが補完的な予測信号を提供する場合に、事前学習された埋め込みよりもパフォーマンスを向上させることを示しました。",
        "ko": "다중모달 표 형식 학습 벤치마크는 태스크 특화 임베딩 튜닝이 예측 신호가 보완적인 경우 성능 향상에 효과적임을 보여줍니다.",
        "es": "Los benchmarks de aprendizaje tabular multimodal muestran que el ajuste de embeddings específicos mejora el rendimiento sobre embeddings preentrenados congelados.",
        "de": "Multimodale Tabellenlernbenchmarks zeigen, dass tiefes Anpassen von Embeddings die Leistung steigert, besonders wenn Modality komplementäre Vorhersagezeichen bereitstellt."
      }
    },
    {
      "arxivId": "2605.15182",
      "title": "Warp-as-History: Generalizable Camera-Controlled Video Generation from One Training Video",
      "summary": "A novel approach called Warp-as-History enables camera-controlled video generation by transforming camera-induced warps into pseudo-history representations, achieving zero-shot capability without training or test-time optimization.",
      "authors": [
        "Yifan Wang",
        "Tong He"
      ],
      "organization": null,
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 38,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15182.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15182",
      "pdfUrl": "https://arxiv.org/pdf/2605.15182.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15182",
      "githubRepo": "https://github.com/yyfz/Warp-as-History",
      "githubStars": 154,
      "keywords": [
        "camera-induced warps",
        "camera-warped pseudo-history",
        "target-frame positional alignment",
        "visible-token selection",
        "visual-history pathway",
        "positional encoding"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Warp-as-History：从单个训练视频生成可控制相机的视频",
      "summary_zh": "通过将相机引起的形变转换为伪历史表示，实现无需训练或测试时优化的零样本相机控制视频生成",
      "title_i18n": {
        "en": "Warp-as-History: Generalizable Camera-Controlled Video Generation from One Training Video",
        "zh-CN": "Warp-as-History：从单个训练视频生成可控制相机的视频",
        "ja": "Warp-as-History: 1本のトレーニング動画からの汎用的なカメラ制御動画生成",
        "ko": "Warp-as-History: Generalizable Camera-Controlled Video Generation from One Training Video",
        "es": "Warp-as-History: Generación de videos controlados por cámara generalizable a partir de un solo video de entrenamiento",
        "de": "Warp-as-History: Allgemeine Kamerasteuerung von Videos aus einem Trainingsvideo"
      },
      "summary_i18n": {
        "en": "A novel approach called Warp-as-History enables camera-controlled video generation by transforming camera-induced warps into pseudo-history representations, achieving zero-shot capability without training or test-time optimization.",
        "zh-CN": "通过将相机引起的形变转换为伪历史表示，实现无需训练或测试时优化的零样本相机控制视频生成",
        "ja": "Warp-as-History は、カメラ誘発のワープを疑似履歴表現に変換することで、トレーニングやテスト時の最適化なしにゼロショットのカメラ制御動画生成を可能にする新しいアプローチです。",
        "ko": "Warp-as-History는 카메라 유도 왜곡을 가상 역사 표현으로 변환하여 하나의 학습 동영상으로부터 일반화된 카메라 제어 동영상 생성을 가능하게 합니다.",
        "es": "Un enfoque nuevo llamado Warp-as-History permite generar videos controlados por cámara transformando distorsiones inducidas por la cámara en representaciones pseudo-históricas.",
        "de": "Ein neuer Ansatz namens Warp-as-History ermöglicht kameresteuerte Videoerzeugung durch Umwandlung von Kamera-Warps in Pseudogeschichtsrepräsentationen."
      }
    },
    {
      "arxivId": "2605.16257",
      "title": "DexJoCo: A Benchmark and Toolkit for Task-Oriented Dexterous Manipulation on MuJoCo",
      "summary": "DexJoCo presents a benchmark and toolkit for dexterous manipulation with 11 functional tasks evaluating tool-use, bimanual coordination, and long-horizon execution, along with a low-cost data collection system and comprehensive model evaluation.",
      "authors": [
        "Hanwen Wang",
        "Weizhi Zhao",
        "Xiangyu Wang",
        "Siyuan Huang",
        "He Lin",
        "Boyuan Zheng"
      ],
      "organization": null,
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 48,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.16257.png",
      "arxivUrl": "https://arxiv.org/abs/2605.16257",
      "pdfUrl": "https://arxiv.org/pdf/2605.16257.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.16257",
      "githubRepo": "https://github.com/brave-eai/dexjoco",
      "githubStars": 50,
      "keywords": [
        "dexterous manipulation",
        "benchmark",
        "toolkit",
        "functionally grounded tasks",
        "tool-use",
        "bimanual coordination"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "DexJoCo：MuJoCo上的任务导向灵巧操作基准与工具包",
      "summary_zh": "构建DexJoCo基准与工具包，包含11个功能任务，评估工具使用、双臂协调和长时序执行。",
      "title_i18n": {
        "en": "DexJoCo: A Benchmark and Toolkit for Task-Oriented Dexterous Manipulation on MuJoCo",
        "zh-CN": "DexJoCo：MuJoCo上的任务导向灵巧操作基准与工具包",
        "ja": "DexJoCo: MuJoCo 上でのタスク指向型繊細な操作のベンチマーキングとツールキット",
        "ko": "DexJoCo: A Benchmark and Toolkit for Task-Oriented Dexterous Manipulation on MuJoCo",
        "es": "DexJoCo: Un benchmark y herramienta para manipulación hábil orientada a tareas en MuJoCo",
        "de": "DexJoCo: Ein Benchmark und Toolkit für tätigkeitsorientierte geschickte Manipulation auf MuJoCo"
      },
      "summary_i18n": {
        "en": "DexJoCo presents a benchmark and toolkit for dexterous manipulation with 11 functional tasks evaluating tool-use, bimanual coordination, and long-horizon execution, along with a low-cost data collection system and comprehensive model evaluation.",
        "zh-CN": "构建DexJoCo基准与工具包，包含11个功能任务，评估工具使用、双臂协调和长时序执行。",
        "ja": "DexJoCo は、ツール使用、両手協調、長期実行を評価する 11 の機能的なタスクを含む、繊細な操作のベンチマーキングとツールキットを提示し、低コストのデータ収集システムと包括的なモデル評価を備えています。",
        "ko": "DexJoCo는 도구 사용, 이완 조절, 장기 실행을 평가하는 11개의 기능적 작업을 제공하는 벤치마크와 툴킷입니다.",
        "es": "DexJoCo presenta un benchmark y herramienta para manipulación hábil con 11 tareas funcionales que evalúan el uso de herramientas, coordinación bimanual y ejecución a largo plazo.",
        "de": "DexJoCo präsentiert einen Benchmark und ein Toolkit für geschickte Manipulation mit 11 Funktionstasks, einschließlich Werkzeugnutzung und bimanueller Koordination."
      }
    },
    {
      "arxivId": "2605.16679",
      "title": "CHI-Bench: Can AI Agents Automate End-to-End, Long-Horizon, Policy-Rich Healthcare Workflows?",
      "summary": "Healthcare workflow benchmark challenges agents with policy-dense, multi-role, and multilateral interaction requirements, revealing significant performance gaps in automated enterprise applications.",
      "authors": [
        "Haolin Chen",
        "Deon Metelski",
        "Leon Qi",
        "Tao Xia",
        "Joonyul Lee",
        "Steve Brown"
      ],
      "organization": {
        "_id": "68edc38fec75faa72a18d292",
        "name": "actava",
        "fullname": "actAVA AI",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/68edc310a4f606a8123967e7/yn4FgSauqB_0VC1xYOgOf.png"
      },
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 43,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.16679.png",
      "arxivUrl": "https://arxiv.org/abs/2605.16679",
      "pdfUrl": "https://arxiv.org/pdf/2605.16679.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.16679",
      "githubRepo": "https://github.com/actava-ai/chi-bench",
      "githubStars": 8,
      "keywords": [
        "healthcare operations",
        "policy density",
        "multi-role composition",
        "multilateral interaction",
        "long-horizon workflows",
        "clinical cases"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "CHI-Bench：AI代理能否自动化长周期多策略医疗流程？",
      "summary_zh": "构建CHI-Bench基准测试，评估AI代理在多角色、多边交互的医疗流程中的自动化能力。",
      "title_i18n": {
        "en": "CHI-Bench: Can AI Agents Automate End-to-End, Long-Horizon, Policy-Rich Healthcare Workflows?",
        "zh-CN": "CHI-Bench：AI代理能否自动化长周期多策略医疗流程？",
        "ja": "CHI-Bench: AI エージェントはエンドツーエンド、長期、ポリシー豊富な医療ワークフローを自動化できるか?",
        "ko": "CHI-Bench: Can AI Agents Automate End-to-End, Long-Horizon, Policy-Rich Healthcare Workflows?",
        "es": "CHI-Bench: ¿Pueden los agentes de IA automatizar flujos de trabajo sanitarios end-to-end, de largo horizonte y ricos en políticas?",
        "de": "CHI-Bench: Kann AI-Agenten End-to-End, langfristige, politikreiche Gesundheitsworkflow automatisieren?"
      },
      "summary_i18n": {
        "en": "Healthcare workflow benchmark challenges agents with policy-dense, multi-role, and multilateral interaction requirements, revealing significant performance gaps in automated enterprise applications.",
        "zh-CN": "构建CHI-Bench基准测试，评估AI代理在多角色、多边交互的医疗流程中的自动化能力。",
        "ja": "医療ワークフローのベンチマーキングは、エージェントにポリシー密集型、多役割、多側面の相互作用要件を課し、自動化企業アプリケーションにおける顕著なパフォーマンスギャップを明らかにしました。",
        "ko": "CHI-Bench는 정책 밀도 높은, 다역할 및 다자 상호작용 요구사항을 가진 의료 워크플로우를 에이전트에게 도전시켜 자동화 기업 응용 프로그램의 성능 격차를 드러냅니다.",
        "es": "El benchmark de flujo de trabajo sanitario desafía a los agentes con requisitos de interacción política densa, multirrol y multilaterales, revelando brechas significativas.",
        "de": "Der Healthcare-Workflow-Benchmark stellt Agenten mit politikdichten, mehreren Rollen und multilateralen Interaktionsanforderungen vor, wobei Leistungslücken in automatisierten Anwendungen auftreten."
      }
    },
    {
      "arxivId": "2605.14271",
      "title": "Auditing Agent Harness Safety",
      "summary": "LLM agents executing within execution harnesses can produce correct outputs while violating safety constraints during execution, necessitating trajectory-level auditing to ensure proper resource access and information flow across multi-agent systems.",
      "authors": [
        "Chengzhi Liu",
        "Yichen Guo",
        "Yepeng Liu",
        "Yuzhe Yang",
        "Qianqi Yan",
        "Xuandong Zhao"
      ],
      "organization": {
        "_id": "65861edfe3f7a2dcf04230f8",
        "name": "ucsbnlp",
        "fullname": "UC Santa Barbara NLP Group",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6002c1db698168af3bb9f4a5/WQYUIGXIycUiVr_J5X2n0.jpeg"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 49,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14271.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14271",
      "pdfUrl": "https://arxiv.org/pdf/2605.14271.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14271",
      "githubRepo": "https://github.com/eric-ai-lab/HarnessAudit",
      "githubStars": 29,
      "keywords": [
        "execution harnesses",
        "tool dispatching",
        "resource allocation",
        "multi-agent systems",
        "safety benchmarks",
        "trajectory auditing"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "审计代理执行安全",
      "summary_zh": "通过轨迹级审计确保多代理系统中资源访问与信息流的安全性",
      "title_i18n": {
        "en": "Auditing Agent Harness Safety",
        "zh-CN": "审计代理执行安全",
        "ja": "エージェントハーネスの安全性の監査",
        "ko": "Auditing Agent Harness Safety",
        "es": "Auditoría de seguridad del entorno de agentes",
        "de": "Auditing Agent Harness Safety"
      },
      "summary_i18n": {
        "en": "LLM agents executing within execution harnesses can produce correct outputs while violating safety constraints during execution, necessitating trajectory-level auditing to ensure proper resource access and information flow across multi-agent systems.",
        "zh-CN": "通过轨迹级审计确保多代理系统中资源访问与信息流的安全性",
        "ja": "実行ハーネス内で実行される LLM エージェントは、出力が正しいものの、実行中に安全制約に違反することがあり、マルチエージェントシステムにおける適切なリソースアクセスと情報フローを確保するためにトラジェクトリーレベルの監査が必要です。",
        "ko": "실행 하arness 내에서 작동하는 LLM 에이전트는 실행 중 안전 제약을 위반할 수 있으므로, 다에이전트 시스템에서 리소스 접근 및 정보 흐름을 보장하기 위해 경로 수준 감사가 필요합니다.",
        "es": "Los agentes LLM que operan dentro de entornos de ejecución pueden producir salidas correctas mientras violan restricciones de seguridad durante la ejecución.",
        "de": "LLM-Agente können korrekte Ausgaben liefern, während sie während der Ausführung Sicherheitsbeschränkungen verletzen, was eine Auditing auf Trajektorieebene erfordert."
      }
    },
    {
      "arxivId": "2605.14278",
      "title": "KVPO: ODE-Native GRPO for Autoregressive Video Alignment via KV Semantic Exploration",
      "summary": "ODENative online GRPO framework KVPO aligns streaming video generators with human preferences through causal-semantic exploration and velocity-field surrogate policy based on trajectory velocity energy.",
      "authors": [
        "Ruicheng Zhang",
        "Kaixi Cong",
        "Jun Zhou",
        "Zhizhou Zhong",
        "Zunnan Xu",
        "Shuiyang Mao"
      ],
      "organization": {
        "_id": "64cc8e9b214a472dd85e7e1d",
        "name": "THU1911",
        "fullname": "Tsinghua University",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/61f8e5934a8e5a275b2b3e5a/oKO6FK_rTzzPHXihicZou.jpeg"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 36,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14278.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14278",
      "pdfUrl": "https://arxiv.org/pdf/2605.14278.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14278",
      "githubRepo": "https://github.com/Richard-Zhang-AI/KVPO",
      "githubStars": 12,
      "keywords": [
        "streaming autoregressive video generators",
        "reinforcement learning",
        "noise-based exploration",
        "SDE-based surrogate policies",
        "ODE dynamics",
        "distilled AR models"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "KVPO：基于KV语义探索的ODE原生GRPO方法",
      "summary_zh": "提出KVPO方法，通过因果语义探索和速度场代理策略实现视频生成与人类偏好的对齐",
      "title_i18n": {
        "en": "KVPO: ODE-Native GRPO for Autoregressive Video Alignment via KV Semantic Exploration",
        "zh-CN": "KVPO：基于KV语义探索的ODE原生GRPO方法",
        "ja": "KVPO: ODE-Native GRPO for Autoregressive Video Alignment via KV Semantic Exploration",
        "ko": "KVPO: ODE-Native GRPO for Autoregressive Video Alignment via KV Semantic Exploration",
        "es": "KVPO: ODE-Native GRPO para alineación de video autoregresivo mediante exploración semántica KV",
        "de": "KVPO: ODE-Native GRPO für autoregressive Video-Alignment durch KV-Semantik-Exploration"
      },
      "summary_i18n": {
        "en": "ODENative online GRPO framework KVPO aligns streaming video generators with human preferences through causal-semantic exploration and velocity-field surrogate policy based on trajectory velocity energy.",
        "zh-CN": "提出KVPO方法，通过因果语义探索和速度场代理策略实现视频生成与人类偏好的对齐",
        "ja": "ODENativeオンラインGRPOフレームワークKVPOは、因果的意味探索と軌道速度エネルギーに基づく速度場代替ポリシーを通じて、ストリーミング動画ジェネレーターを人間の好みに合わせます。",
        "ko": "ODENative 온라인 GRPO 프레임워크 KVPO는 인과-세미틱 탐색 및 궤적 속도 에너지 기반의 속도장 대체 정책을 통해 스트리밍 비디오 생성기를 인간 선호와 맞춥니다.",
        "es": "El marco ODE-Native online GRPO KVPO alinea generadores de video en tiempo real con preferencias humanas mediante exploración causal-semántica y política de campo de velocidad basada en energía de trayectoria.",
        "de": "ODENative Online-GRPO-Framework KVPO aligniert Streaming-Video-Generatoren mit menschlichen Präferenzen durch kausale-Semantik-Exploration und Geschwindigkeitsfeld-Surrogate-Politik basierend auf Trajektorien-Energie."
      }
    },
    {
      "arxivId": "2605.14892",
      "title": "Beyond Individual Intelligence: Surveying Collaboration, Failure Attribution, and Self-Evolution in LLM-based Multi-Agent Systems",
      "summary": "Multi-agent systems face challenges in sustained coordination and error propagation, requiring integrated approaches that enable continuous diagnosis, reorganization, and behavioral refinement across structured collaboration stages.",
      "authors": [
        "Shihao Qi",
        "Jie Ma",
        "Rui Xing",
        "Wei Guo",
        "Xiao Huang",
        "Zhitao Gao"
      ],
      "organization": {
        "_id": "66a92d5a58cff488d93ab512",
        "name": "XianJiaotongUniversity",
        "fullname": "Xi'an Jiaotong University",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/66a92ba2f351acac61ba119c/6zLTkLwBLMbRLR1y7tfpC.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 46,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14892.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14892",
      "pdfUrl": "https://arxiv.org/pdf/2605.14892.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14892",
      "githubRepo": "https://github.com/mira-ai-lab/awesome-mas-life",
      "githubStars": 32,
      "keywords": [
        "multi-agent systems",
        "autonomous agents",
        "coordination",
        "error propagation",
        "self-improvement",
        "agent collaboration"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "超越个体智能：LLM多智能体系统的协作、失败归因与自我进化综述",
      "summary_zh": "研究多智能体系统在协作、失败归因和自我进化方面的挑战，提出基于LLM的集成方法实现持续诊断与行为优化。",
      "title_i18n": {
        "en": "Beyond Individual Intelligence: Surveying Collaboration, Failure Attribution, and Self-Evolution in LLM-based Multi-Agent Systems",
        "zh-CN": "超越个体智能：LLM多智能体系统的协作、失败归因与自我进化综述",
        "ja": "Beyond Individual Intelligence: Surveying Collaboration, Failure Attribution, and Self-Evolution in LLM-based Multi-Agent Systems",
        "ko": "Beyond Individual Intelligence: Surveying Collaboration, Failure Attribution, and Self-Evolution in LLM-based Multi-Agent Systems",
        "es": "Más allá de la inteligencia individual: Revisión de la colaboración, atribución de fallos y evolución propia en sistemas multiagente basados en LLM",
        "de": "Jenseits individueller Intelligenz: Eine Untersuchung zur Zusammenarbeit, Fehlerzuordnung und Selbstentwicklung in LLM-basierten Multi-Agenten-Systemen"
      },
      "summary_i18n": {
        "en": "Multi-agent systems face challenges in sustained coordination and error propagation, requiring integrated approaches that enable continuous diagnosis, reorganization, and behavioral refinement across structured collaboration stages.",
        "zh-CN": "研究多智能体系统在协作、失败归因和自我进化方面的挑战，提出基于LLM的集成方法实现持续诊断与行为优化。",
        "ja": "マルチエージェントシステムは持続的な調整とエラーの拡散に直面し、構造化された協力段階で継続的な診断と行動の改善を可能にする統合的なアプローチが必要です。",
        "ko": "다중 에이전트 시스템은 지속적인 협력과 오류 확산에 도전하며, 구조화된 협업 단계에서 지속적인 진단, 재구성 및 행동 개선을 가능하게 하는 통합 접근이 필요합니다.",
        "es": "Los sistemas multiagente enfrentan desafíos en coordinación sostenida y propagación de errores, requiriendo enfoques integrados que permitan diagnóstico continuo, reorganización y refinamiento comportamental en etapas de colaboración estructurada.",
        "de": "Multi-Agenten-Systeme stehen vor Herausforderungen bei nachhaltiger Koordination und Fehlerverbreitung, wozu integrierte Ansätze erforderlich sind, die kontinuierliche Diagnose, Umstrukturierung und Verhaltensverfeinerung über strukturierte Zusammenarbeitsphasen ermöglichen."
      }
    },
    {
      "arxivId": "2605.12587",
      "title": "TrackCraft3R: Repurposing Video Diffusion Transformers for Dense 3D Tracking",
      "summary": "TrackCraft3R enables efficient dense 3D tracking from monocular video by adapting video diffusion transformers to follow physical points across frames using dual-latent representation and temporal RoPE alignment.",
      "authors": [
        "Jisu Nam",
        "Jahyeok Koo",
        "Soowon Son",
        "Jaewoo Jung",
        "Honggyu An",
        "Junhwa Hur"
      ],
      "organization": {
        "_id": "5e6aca39878b8b2bf9806447",
        "name": "google",
        "fullname": "Google",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/5dd96eb166059660ed1ee413/WtA3YYitedOr9n02eHfJe.png"
      },
      "publishedAt": "2026-05-12T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 36,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.12587.png",
      "arxivUrl": "https://arxiv.org/abs/2605.12587",
      "pdfUrl": "https://arxiv.org/pdf/2605.12587.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.12587",
      "githubRepo": "https://github.com/cvlab-kaist/TrackCraft3r",
      "githubStars": 74,
      "keywords": [
        "video diffusion transformers",
        "video DiTs",
        "dense 3D tracking",
        "reference-anchored tracking",
        "per-frame geometry latents",
        "track latents"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "TrackCraft3R：将视频扩散Transformer用于密集3D跟踪",
      "summary_zh": "通过双潜在表示和时间RoPE对齐，将视频扩散Transformer用于单目视频的高效密集3D跟踪",
      "title_i18n": {
        "en": "TrackCraft3R: Repurposing Video Diffusion Transformers for Dense 3D Tracking",
        "zh-CN": "TrackCraft3R：将视频扩散Transformer用于密集3D跟踪",
        "ja": "TrackCraft3R: Repurposing Video Diffusion Transformers for Dense 3D Tracking",
        "ko": "TrackCraft3R: Repurposing Video Diffusion Transformers for Dense 3D Tracking",
        "es": "TrackCraft3R: Repurposing Video Diffusion Transformers para seguimiento 3D denso",
        "de": "TrackCraft3R: Wiederverwendung von Video-Diffusions-Transformern für dichte 3D-Verfolgung"
      },
      "summary_i18n": {
        "en": "TrackCraft3R enables efficient dense 3D tracking from monocular video by adapting video diffusion transformers to follow physical points across frames using dual-latent representation and temporal RoPE alignment.",
        "zh-CN": "通过双潜在表示和时间RoPE对齐，将视频扩散Transformer用于单目视频的高效密集3D跟踪",
        "ja": "TrackCraft3Rは、双対潜在表現と時間的RoPEアライメントを使用して、単眼動画から効率的な密な3Dトラッキングを実現します。",
        "ko": "TrackCraft3R은 이중 잠재 표현 및 시간 RoPE 정렬을 사용하여 단안 영상에서 물리적 점을 프레임 간에 따라잡는 비디오 분산 변환기를 재사용하여 효율적인 밀도 3D 추적을 가능하게 합니다.",
        "es": "TrackCraft3R permite un seguimiento 3D denso eficiente a partir de video monocular adaptando transformadores de difusión de video para seguir puntos físicos a través de cuadros usando representación dual-latente y alineación temporal RoPE.",
        "de": "TrackCraft3R ermöglicht effiziente dichte 3D-Verfolgung aus Monochrom-Video, indem Video-Diffusions-Transformer an physische Punkte über Frames anpassen, mit dualer Latent-Repräsentation und zeitlicher RoPE-Ausrichtung."
      }
    },
    {
      "arxivId": "2605.11739",
      "title": "Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation",
      "summary": "On-policy distillation efficiency arises from early establishment of stable update trajectories, with findings leading to a plug-and-play acceleration method achieving 3x training speedup.",
      "authors": [
        "Yuchen Cai",
        "Ding Cao",
        "Liang Lin",
        "Chunxi Luo",
        "Xin Xu",
        "Kai Yang"
      ],
      "organization": {
        "_id": "6645f953c39288df638dbdd5",
        "name": "Tencent-Hunyuan",
        "fullname": "Tencent Hunyuan",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/62d22496c58f969c152bcefd/woKSjt2wXvBNKussyYPsa.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 52,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.11739.png",
      "arxivUrl": "https://arxiv.org/abs/2605.11739",
      "pdfUrl": "https://arxiv.org/pdf/2605.11739.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.11739",
      "githubRepo": "https://github.com/caiyuchen-ustc/EffOPD",
      "githubStars": 16,
      "keywords": [
        "on-policy distillation",
        "post-training paradigm",
        "parameter-level mechanisms",
        "module-allocation level",
        "update-direction level",
        "low-rank concentration"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "学习预见：揭示在策略蒸馏中的效率提升机制",
      "summary_zh": "通过分析稳定更新轨迹的早期建立，提出一种3倍训练加速的即插即用方法",
      "title_i18n": {
        "en": "Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation",
        "zh-CN": "学习预见：揭示在策略蒸馏中的效率提升机制",
        "ja": "Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation",
        "ko": "Learning to Foresee: Unveiling the Unlocking Efficiency of On-Policy Distillation",
        "es": "Aprender a prever: Revelando la eficiencia del distillado en política",
        "de": "Lernen zu antizipieren: Aufdecken der Effizienz von On-Policy-Distillation"
      },
      "summary_i18n": {
        "en": "On-policy distillation efficiency arises from early establishment of stable update trajectories, with findings leading to a plug-and-play acceleration method achieving 3x training speedup.",
        "zh-CN": "通过分析稳定更新轨迹的早期建立，提出一种3倍训练加速的即插即用方法",
        "ja": "オンポリシードィスティルレーションの効率は、安定した更新トレースの早期確立から生じ、3倍のトレーニングスピードアップを達成する即插即用の加速方法をもたらします。",
        "ko": "온정책 증류 효율성은 초기 안정된 업데이트 경로의 설정에서 비롯되며, 이 발견은 3배의 학습 속도 향상이 가능한 플러그 앤 플레이 가속 방법으로 이어집니다.",
        "es": "La eficiencia del distillado en política surge de la establecimiento temprano de trayectorias de actualización estables, con hallazgos que llevan a un método de aceleración plug-and-play logrando un aumento de 3x en la velocidad de entrenamiento.",
        "de": "Die Effizienz der On-Policy-Distillation ergibt sich aus der frühen Stabilisierung von Update-Trajektorien, wodurch ein plug-and-play-Beschleunigungsmethode mit 3-facher Trainingsgeschwindigkeit entsteht."
      }
    },
    {
      "arxivId": "2605.12964",
      "title": "Asymmetric Flow Models",
      "summary": "Asymmetric Flow Modeling enables efficient high-dimensional flow-based generation by restricting noise prediction to low-rank subspaces while maintaining full-dimensional data prediction, achieving superior performance in pixel-space text-to-image generation through effective fine-tuning from latent models.",
      "authors": [
        "Hansheng Chen",
        "Jan Ackermann",
        "Minseo Kim",
        "Gordon Wetzstein",
        "Leonidas Guibas"
      ],
      "organization": {
        "_id": "672c672dcf09d152f4da04c4",
        "name": "StanfordUniversity",
        "fullname": "Stanford University",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/68e396f2b5bb631e9b2fac9a/vJI0POlzGMXL2878t1vz2.jpeg"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 21,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.12964.png",
      "arxivUrl": "https://arxiv.org/abs/2605.12964",
      "pdfUrl": "https://arxiv.org/pdf/2605.12964.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.12964",
      "githubRepo": "https://github.com/Lakonik/LakonLab",
      "githubStars": 383,
      "keywords": [
        "flow-based generation",
        "velocity prediction",
        "high-dimensional noise",
        "low-rank structure",
        "rank-asymmetric velocity parameterization",
        "diffusion models"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "非对称流模型",
      "summary_zh": "通过限制噪声预测到低秩子空间实现高效高维生成，提升像素空间文本到图像生成性能。",
      "title_i18n": {
        "en": "Asymmetric Flow Models",
        "zh-CN": "非对称流模型",
        "ja": "Asymmetric Flow Models",
        "ko": "Asymmetric Flow Models",
        "es": "Modelos de flujo asimétrico",
        "de": "Asymmetrische Flussmodelle"
      },
      "summary_i18n": {
        "en": "Asymmetric Flow Modeling enables efficient high-dimensional flow-based generation by restricting noise prediction to low-rank subspaces while maintaining full-dimensional data prediction, achieving superior performance in pixel-space text-to-image generation through effective fine-tuning from latent models.",
        "zh-CN": "通过限制噪声预测到低秩子空间实现高效高维生成，提升像素空间文本到图像生成性能。",
        "ja": "非対称フローモデルは、ノイズ予測を低ランク部分空間に制限しながら、フル次元データ予測を維持することで、高次元フローサポート生成を効率的に実現します。",
        "ko": "비대칭 흐름 모델링은 낮은 랭크 하위공간에만 노이즈 예측을 제한하면서도 전체 차원 데이터 예측을 유지함으로써 고차원 흐름 기반 생성을 효율적으로 수행합니다.",
        "es": "El modelado de flujo asimétrico permite una generación eficiente basada en flujo en dimensiones altas restringiendo la predicción de ruido a subespacios de rango bajo mientras se mantiene la predicción de datos de dimensión completa, logrando un mejor rendimiento en generación de imágenes desde texto en espacio de píxeles mediante ajuste fino efectivo desde modelos latentes.",
        "de": "Asymmetrisches Flussmodellierung ermöglicht effiziente hochdimensionale Fluss-basierte Generierung, indem Rauschvorhersage auf niedrigrangige Teilräume beschränkt wird, während vollständige Datenvorhersage gewahrt bleibt, mit besserer Leistung in Pixel-Raum Text-zu-Bild-Generation durch effektives Fine-Tuning von latenten Modellen."
      }
    },
    {
      "arxivId": "2605.18643",
      "title": "Post-Trained MoE Can Skip Half Experts via Self-Distillation",
      "summary": "Zero-Expert Self-Distillation Adaptation (ZEDA) enables efficient dynamic Mixture-of-Experts models by converting static models into adaptive ones with reduced computational costs and improved inference speed.",
      "authors": [
        "Xingtai Lv",
        "Li Sheng",
        "Kaiyan Zhang",
        "Yichen You",
        "Siyan Gao",
        "Xueheng Luo"
      ],
      "organization": {
        "_id": "64cc8e9b214a472dd85e7e1d",
        "name": "THU1911",
        "fullname": "Tsinghua University",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/61f8e5934a8e5a275b2b3e5a/oKO6FK_rTzzPHXihicZou.jpeg"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 27,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18643.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18643",
      "pdfUrl": "https://arxiv.org/pdf/2605.18643.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18643",
      "githubRepo": "https://github.com/TsinghuaC3I/ZEDA",
      "githubStars": 15,
      "keywords": [
        "Mixture-of-Experts",
        "sparse expert activation",
        "dynamic MoE",
        "self-distillation",
        "parameter-free zero-output experts",
        "group-level balancing loss"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "后训练MoE通过自蒸馏可跳过一半专家",
      "summary_zh": "通过自蒸馏方法将静态MoE模型转化为动态模型，降低计算成本并提升推理速度。",
      "title_i18n": {
        "en": "Post-Trained MoE Can Skip Half Experts via Self-Distillation",
        "zh-CN": "后训练MoE通过自蒸馏可跳过一半专家",
        "ja": "Post-Trained MoE Can Skip Half Experts via Self-Distillation",
        "ko": "Post-Trained MoE Can Skip Half Experts via Self-Distillation",
        "es": "MoE post-entrenado puede omitir la mitad de expertos mediante auto-distilación",
        "de": "Nachtrainierte MoE können die Hälfte der Experten überspringen durch Selbst-Distillation"
      },
      "summary_i18n": {
        "en": "Zero-Expert Self-Distillation Adaptation (ZEDA) enables efficient dynamic Mixture-of-Experts models by converting static models into adaptive ones with reduced computational costs and improved inference speed.",
        "zh-CN": "通过自蒸馏方法将静态MoE模型转化为动态模型，降低计算成本并提升推理速度。",
        "ja": "ゼロエキスパート自己ディスティル適応（ZEDA）は、静的モデルを動的Mixture-of-Expertsモデルに変換し、計算コストを削減し、推論速度を向上させることで、効率的な動的Mixture-of-Expertsモデルを可能にします。",
        "ko": "제로-에이전트 자기 증류 적응(ZEDA)은 계산 비용 감소 및 추론 속도 향상을 통해 정적 모델을 동적 Mixture-of-Experts 모델로 전환하는 효율적인 방법입니다.",
        "es": "La adaptación de auto-distilación de cero expertos (ZEDA) permite modelos dinámicos de mezcla de expertos eficientes convirtiendo modelos estáticos en adaptativos con costos computacionales reducidos y mayor velocidad de inferencia.",
        "de": "Zero-Expert Self-Distillation Adaptation (ZEDA) ermöglicht effiziente dynamische Mixture-of-Experts-Modelle, indem statische Modelle in adaptive Modelle umgewandelt werden mit reduzierten Rechenkosten und verbesserter Inferenzgeschwindigkeit."
      }
    },
    {
      "arxivId": "2605.20179v1",
      "title": "TIDE: Efficient and Lossless MoE Diffusion LLM Inference with I/O-aware Expert Offload",
      "summary": "Diffusion Large Language Models (dLLMs) have emerged as a competitive alternative to autoregressive (AR) models, offering better hardware utilization and bidirectional context through parallel block-level decoding. However, as dLLMs continue to scale up with mixture-of-experts (MoE) architectures, their deployment on resource-constrained devices remains an open challenge. Existing AR-based methods often incur either prohibitive I/O overhead or significant compute bottlenecks. In this work, we propose TIDE, a novel resource-efficient inference system that leverages the temporal stability of expert activations during the diffusion process within the block. Specifically, we leverage the temporal stability of expert activations during the diffusion process within the block and introduce an interval-based expert refresh strategy that updates the expert placement in an I/O-aware fashion. To ensure optimal performance, we formulate the inference scheduling as a mathematical programming problem, solving for the optimal interval that minimizes I/O traffic and CPU computation. Most importantly, TIDE is a lossless optimization that requires no model training, providing a \"free lunch\" acceleration for dLLM inference. In a single GPU-CPU system, we demonstrate that TIDE achieves up to 1.4$\\times$ and 1.5$\\times$ throughput improvements over prior baselines on LLaDA2.0-mini and LLaDA2.0-flash models, respectively.",
      "authors": [
        "Zhiben Chen",
        "Youpeng Zhao",
        "Yang Sui",
        "Jun Wang",
        "Yuzhang Shang"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T17:59:08Z",
      "submittedAt": "2026-05-19T17:59:08Z",
      "upvotes": 0,
      "comments": 0,
      "thumbnail": null,
      "arxivUrl": "https://arxiv.org/abs/2605.20179v1",
      "pdfUrl": "https://arxiv.org/pdf/2605.20179v1",
      "hfUrl": "https://arxiv.org/abs/2605.20179v1",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "cs.CL"
      ],
      "source": "arXiv cs.CL",
      "sourceType": "arxiv",
      "title_zh": "TIDE：基于I/O感知的专家卸载高效无损MoE扩散LLM推理",
      "summary_zh": "提出TIDE系统，利用扩散过程中的专家激活时间稳定性，通过区间刷新策略优化I/O和计算，实现无损失加速，提升1.4到1.5倍吞吐量。",
      "title_i18n": {
        "en": "TIDE: Efficient and Lossless MoE Diffusion LLM Inference with I/O-aware Expert Offload",
        "zh-CN": "TIDE：基于I/O感知的专家卸载高效无损MoE扩散LLM推理",
        "ja": "TIDE: Efficient and Lossless MoE Diffusion LLM Inference with I/O-aware Expert Offload",
        "ko": "TIDE: Efficient and Lossless MoE Diffusion LLM Inference with I/O-aware Expert Offload",
        "es": "TIDE: Inferencia eficiente y sin pérdida de MoE de LLM de difusión con carga I/O consciente",
        "de": "TIDE: Effizienter und verlustfreier MoE-Diffusions-LLM-Inferenz mit I/O-orientierter Experten-Übertragung"
      },
      "summary_i18n": {
        "en": "Diffusion Large Language Models (dLLMs) have emerged as a competitive alternative to autoregressive (AR) models, offering better hardware utilization and bidirectional context through parallel block-level decoding. However, as dLLMs continue to scale up with mixture-of-experts (MoE) architectures, their deployment on resource-constrained devices remains an open challenge. Existing AR-based methods often incur either prohibitive I/O overhead or significant compute bottlenecks. In this work, we propose TIDE, a novel resource-efficient inference system that leverages the temporal stability of expert activations during the diffusion process within the block. Specifically, we leverage the temporal stability of expert activations during the diffusion process within the block and introduce an interval-based expert refresh strategy that updates the expert placement in an I/O-aware fashion. To ensure optimal performance, we formulate the inference scheduling as a mathematical programming problem, solving for the optimal interval that minimizes I/O traffic and CPU computation. Most importantly, TIDE is a lossless optimization that requires no model training, providing a \"free lunch\" acceleration for dLLM inference. In a single GPU-CPU system, we demonstrate that TIDE achieves up to 1.4$\\times$ and 1.5$\\times$ throughput improvements over prior baselines on LLaDA2.0-mini and LLaDA2.0-flash models, respectively.",
        "zh-CN": "提出TIDE系统，利用扩散过程中的专家激活时间稳定性，通过区间刷新策略优化I/O和计算，实现无损失加速，提升1.4到1.5倍吞吐量。",
        "ja": "Diffusion Large Language Models (dLLMs)は、並列ブロックレベルデコードを通じて、より良いハードウェア利用と双方向的なコンテキストを提供するARモデルの競争的な代替として登場しました。しかし、dLLMsがMoEアーキテクチャとともに拡大し続ける中、リソース制約のあるデバイスでの展開は依然として課題です。既存のARベースの方法は、禁止的なI/Oオーバーヘッドまたは大きな計算ボトルネックのいずれかを引き起こします。本研究では、ブロック内の拡散プロセスにおけるエキスパート活性の時間的安定性を活用し、I/Oに配慮したインターバルベースのエキスパートリフレッシュ戦略を導入する新しいリソース効率の高いインファレンスシステムTIDEを提案します。最適なパフォーマンスを確保するために、インファレンススケジューリングを数学的プログラミング問題として定式化し、I/OトラフィックとCPU計算を最小化する最適なインターバルを求めます。最も重要なのは、TIDEはモデルトレーニングを必要としないロスレス最適化であり、dLLMインファレンスに「無料のランチ」加速を提供します。単一のGPU-CPUシステムにおいて、LLaDA2.0-miniおよびLLaDA2.0-flashモデルで、TIDEは従来のベースラインよりも最大1.4$\\times$および1.5$\\times$のスループット向上を示します。",
        "ko": "확산 대규모 언어 모델(dLLMs)은 자동 회귀(AR) 모델과 경쟁적이며, 병렬 블록 수준 디코딩을 통해 더 나은 하드웨어 활용과 양방향 맥락을 제공합니다. 그러나 dLLMs가 혼합 전문가(MoE) 아키텍처를 통해 계속 확장됨에 따라, 자원 제한 장치에서의 배포는 여전히 열린 문제입니다. 기존 AR 기반 방법은 금지할 수 없는 I/O 오버헤드 또는 중요한 계산 볼트넥을 유발합니다. 본 연구에서는 확산 과정 내 블록에서 전문가 활성화의 시간 안정성을 활용하는 새로운 자원 효율적인 추론 시스템 TIDE를 제안합니다. 특히, 우리는 확산 과정 내 블록에서 전문가 활성화의 시간 안정성을 활용하고, I/O 인식 방식으로 전문가 배치를 업데이트하는 간격 기반 전문가 리프레시 전략을 도입합니다. 최적 성능을 보장하기 위해 추론 스케줄링을 수학적 프로그래밍 문제로 형성하여 I/O 트래픽과 CPU 계산을 최소화하는 최적 간격을 해결합니다. 가장 중요한 것은 TIDE는 모델 학습 없이 손실 없는 최적화를 제공하며, dLLM 추론에 대한 \"무료 식사\" 가속을 제공합니다. 단일 GPU-CPU 시스템에서, 우리는 TIDE가 LLaDA2.0-mini 및 LLaDA2.0-flash 모델에서 이전 기준보다 각각 1.4$\\times$ 및 1.5$\\times$의 처리량 향상을 보여줍니다.",
        "es": "Los modelos de lenguaje grandes de difusión (dLLMs) han surgido como alternativa competitiva a los modelos autoregresivos (AR), ofreciendo mejor utilización de hardware y contexto bidireccional mediante decodificación paralela por bloques. Sin embargo, a medida que los dLLMs siguen creciendo con arquitecturas de mezcla de expertos (MoE), su implementación en dispositivos con recursos limitados sigue siendo un desafío abierto. Los métodos basados en AR existentes suelen incurrir en sobrecarga I/O prohibitiva o cuellos de botella computacionales significativos. En este trabajo, proponemos TIDE, un sistema de inferencia eficiente en recursos que aprovecha la estabilidad temporal de las activaciones de expertos durante el proceso de difusión dentro del bloque. Específicamente, aprovechamos la estabilidad temporal de las activaciones de expertos durante el proceso de difusión dentro del bloque e introducimos una estrategia de renovación de expertos basada en intervalos que actualiza la ubicación de expertos de manera consciente de I/O. Para garantizar un buen rendimiento, formulamos el plan de inferencia como un problema de programación matemática, resolviéndolo para encontrar el intervalo óptimo que minimice el tráfico I/O y la computación de CPU. Lo más importante es que TIDE es una optimización sin pérdida que no requiere entrenamiento del modelo, proporcionando una aceleración de \"comida gratis\" para la inferencia de dLLM. En un sistema único GPU-CPU, demostramos que TIDE logra mejoras de hasta 1.4$\\times$ y 1.5$\\times$ en throughput sobre bases anteriores en los modelos LLaDA2.0-mini y LLaDA2.0-flash, respectivamente.",
        "de": "Diffusions-Größere Sprachmodelle (dLLMs) haben sich als konkurrierende Alternative zu autoregressiven (AR) Modellen etabliert, mit besserer Hardware-Nutzung und bidirektionaler Kontext durch parallele Block-Entkodierung. Allerdings bleibt ihre Bereitstellung auf ressourcenbeschränkten Geräten eine offene Herausforderung. Bestehende AR-basierte Methoden verursachen oft prohibitiv hohe I/O-Overhead oder erhebliche Rechenengpässe. In dieser Arbeit schlagen wir TIDE vor, ein neues ressourceneffizientes Inferenzsystem, das die zeitliche Stabilität der Expertenaktivierungen während des Diffusionsprozesses innerhalb des Blocks nutzt. Insbesondere nutzen wir die zeitliche Stabilität der Expertenaktivierungen während des Diffusionsprozesses innerhalb des Blocks und führen eine intervallbasierte Experten-Refresh-Strategie ein, die die Expertenplatzierung I/O-orientiert aktualisiert. Um optimale Leistung sicherzustellen, formulieren wir die Inferenzplanung als mathematisches Programmierungsproblem und lösen nach dem optimalen Intervall, das I/O-Verkehr und CPU-Berechnung minimiert. Am wichtigsten ist, dass TIDE eine verlustfreie Optimierung ist, die keine Modelltraining erfordert, und bietet eine „kostenlose Beschleunigung“ für dLLM-Inferenz. In einem einzelnen GPU-CPU-System zeigen wir, dass TIDE bis zu 1,4× und 1,5× Durchsatzverbesserungen gegenüber vorherigen Benchmarks auf den Modellen LLaDA2.0-mini und LLaDA2.0-flash erreicht."
      }
    },
    {
      "arxivId": "2605.20177v1",
      "title": "From Seeing to Thinking: Decoupling Perception and Reasoning Improves Post-Training of Vision-Language Models",
      "summary": "Recent advances in vision-language models (VLMs) emphasize long chain-of-thought reasoning; yet, we find that their performance on visual tasks is primarily limited by a lack of visual perception as opposed to reasoning itself. In this work, we systematically study the interplay between perception and reasoning in VLM post-training by decomposing their capabilities into three separate training stages: visual perception, visual reasoning, and textual reasoning, incorporating specialized training data. We demonstrate that visual perception (a) requires targeted optimization with specialized data; (b) serves as a fundamental scaffold that should be solidified through staged training before refining visual reasoning; and (c) is more effectively learned via RL than caption-based SFT. Our experiments across multiple VLMs demonstrate that staged training consistently improves both visual perception and reasoning performance over merged training. Notably, models trained with our approach achieve 1.5% higher reasoning accuracy with 20.8% shorter reasoning traces, suggesting that superior perception reduces the need for excessive reasoning. Furthermore, we show that this capability-based staging represents a new curriculum dimension orthogonal to traditional difficulty-based curricula, and combining both yields further additive gains. Our staged-training models achieve superior performance among open-weight VLMs, establishing advanced results on several visual math and perception (e.g., +5.2% on WeMath and +3.7% on RealWorldQA) tasks compared with the base counterpart.",
      "authors": [
        "Juncheng Wu",
        "Hardy Chen",
        "Haoqin Tu",
        "Xianfeng Tang",
        "Freda Shi",
        "Hui Liu"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T17:58:40Z",
      "submittedAt": "2026-05-19T17:58:40Z",
      "upvotes": 0,
      "comments": 0,
      "thumbnail": null,
      "arxivUrl": "https://arxiv.org/abs/2605.20177v1",
      "pdfUrl": "https://arxiv.org/pdf/2605.20177v1",
      "hfUrl": "https://arxiv.org/abs/2605.20177v1",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "cs.CL",
        "cs.CV"
      ],
      "source": "arXiv cs.CL",
      "sourceType": "arxiv",
      "title_zh": "从视觉到思考：解耦感知与推理提升视觉语言模型的后训练",
      "summary_zh": "通过分阶段训练解耦视觉感知与推理，提升视觉语言模型性能，实现推理准确率提高1.5%且推理轨迹缩短20.8%。",
      "title_i18n": {
        "en": "From Seeing to Thinking: Decoupling Perception and Reasoning Improves Post-Training of Vision-Language Models",
        "zh-CN": "从视觉到思考：解耦感知与推理提升视觉语言模型的后训练",
        "ja": "From Seeing to Thinking: Decoupling Perception and Reasoning Improves Post-Training of Vision-Language Models",
        "ko": "From Seeing to Thinking: Decoupling Perception and Reasoning Improves Post-Training of Vision-Language Models",
        "es": "De ver a pensar: Separar percepción y razonamiento mejora el post-entrenamiento de modelos visión-lenguaje",
        "de": "Von Sehen zum Denken: Die Entkoppelung von Wahrnehmung und Schlussfolgerung verbessert die Nachtraining von Vision-Language-Modellen"
      },
      "summary_i18n": {
        "en": "Recent advances in vision-language models (VLMs) emphasize long chain-of-thought reasoning; yet, we find that their performance on visual tasks is primarily limited by a lack of visual perception as opposed to reasoning itself. In this work, we systematically study the interplay between perception and reasoning in VLM post-training by decomposing their capabilities into three separate training stages: visual perception, visual reasoning, and textual reasoning, incorporating specialized training data. We demonstrate that visual perception (a) requires targeted optimization with specialized data; (b) serves as a fundamental scaffold that should be solidified through staged training before refining visual reasoning; and (c) is more effectively learned via RL than caption-based SFT. Our experiments across multiple VLMs demonstrate that staged training consistently improves both visual perception and reasoning performance over merged training. Notably, models trained with our approach achieve 1.5% higher reasoning accuracy with 20.8% shorter reasoning traces, suggesting that superior perception reduces the need for excessive reasoning. Furthermore, we show that this capability-based staging represents a new curriculum dimension orthogonal to traditional difficulty-based curricula, and combining both yields further additive gains. Our staged-training models achieve superior performance among open-weight VLMs, establishing advanced results on several visual math and perception (e.g., +5.2% on WeMath and +3.7% on RealWorldQA) tasks compared with the base counterpart.",
        "zh-CN": "通过分阶段训练解耦视觉感知与推理，提升视觉语言模型性能，实现推理准确率提高1.5%且推理轨迹缩短20.8%。",
        "ja": "視覚言語モデル（VLMs）の最近の進歩は、長く続く思考の連鎖を強調していますが、我々は視覚タスクにおけるそのパフォーマンスが、主に推論自体ではなく視覚的認識の欠如によって制限されていることを発見しました。この研究では、視覚的認識と推論の相互作用を体系的に研究し、彼らの能力を3つの別々なトレーニングステージに分解して、専門的なトレーニングデータを組み込みました。私たちは、視覚的認識（a）専門的なデータによるターゲット最適化を必要とする；（b）視覚的推論を精緻化する前に、しっかりとした段階的なトレーニングで固めるべき基本的な枠組みである；（c）キャプションベースのSFTよりもRLにより効果的に学べるという点を示しました。複数のVLM上で行った実験では、段階的なトレーニングが統合トレーニングよりも視覚的認識と推論の両方のパフォーマンスを一貫して向上させることを示しました。特に、我々のアプローチでトレーニングされたモデルは、推論トレースが20.8%短縮され、1.5%高い推論精度を達成しており、優れた認識が過剰な推論の必要性を減らすことを示しています。さらに、この能力ベースの段階化は、従来の難易度ベースのカリキュラムとは直交する新しいカリキュラムの次元を表しており、両方を組み合わせるとさらなる追加利益を得られます。段階的トレーニングモデルは、オープン重みVLMの中で優れたパフォーマンスを達成し、ベースモデルと比較していくつかの視覚数学および認識（例：WeMathで+5.2%、RealWorldQAで+3.7%）タスクで先進的な結果を確立しています。",
        "ko": "시각-언어 모델(VLMs)의 최근 발전은 장기적인 사고 체인 추론을 강조하지만, 시각 작업에서의 성능은 추론 자체보다 시각 인식 부족에 의해 주로 제한됩니다. 본 연구에서는 시각 인식과 추론의 상호작용을 체계적으로 연구하며, 세 가지 별도 학습 단계: 시각 인식, 시각 추론, 텍스트 추론을 포함하여 특수 학습 데이터를 사용합니다. 우리는 시각 인식(a)이 특수 데이터로 표적 최적화가 필요하며, (b) 시각 추론을 개선하기 전에 단계별 학습을 통해 견고하게 해야 하며, (c) 캡션 기반 SFT보다 RL을 통해 더 효과적으로 학습된다고 보여줍니다. 여러 VLMs에서의 실험 결과는 단계별 학습이 병합 학습보다 시각 인식과 추론 성능을 일관되게 개선함을 보여줍니다. 특히, 우리의 방법으로 학습한 모델은 1.5% 더 높은 추론 정확도와 20.8% 짧은 추론 추적을 달성하며, 우수한 인식은 과도한 추론의 필요성을 줄입니다. 또한, 이 능력 기반 단계는 전통적인 어려움 기반 커리큘럼과 직교하는 새로운 커리큘럼 차원을 나타내며, 두 가지를 결합하면 추가적인 이점을 제공합니다. 단계별 학습 모델은 오픈 무게 VLMs 중에서 우수한 성능을 보이며, WeMath 및 RealWorldQA(예: WeMath +5.2%, RealWorldQA +3.7%)와 같은 여러 시각 수학 및 인식 작업에서 기본 대조군보다 우수한 결과를 달성합니다.",
        "es": "Avances recientes en modelos visión-lenguaje (VLMs) enfatizan el razonamiento de cadena larga; sin embargo, encontramos que su rendimiento en tareas visuales está principalmente limitado por una falta de percepción visual en lugar de razonamiento mismo. En este trabajo, estudiamos sistemáticamente la interacción entre percepción y razonamiento en el post-entrenamiento de VLMs al descomponer sus capacidades en tres etapas de entrenamiento separadas: percepción visual, razonamiento visual y razonamiento textual, incorporando datos de entrenamiento especializados. Demostramos que la percepción visual (a) requiere optimización específica con datos especializados; (b) sirve como andamiaje fundamental que debe consolidarse mediante entrenamiento escalonado antes de refinar el razonamiento visual; y (c) se aprende más eficazmente mediante RL que mediante SFT basado en descripciones. Nuestros experimentos en múltiples VLMs demuestran que el entrenamiento escalonado mejora consistentemente tanto el rendimiento de percepción visual como de razonamiento en comparación con el entrenamiento combinado. Notablemente, los modelos entrenados con nuestro enfoque alcanzan una precisión de razonamiento 1.5% mayor con trazas de razonamiento 20.8% más cortas, sugiriendo que una mejor percepción reduce la necesidad de razonamiento excesivo. Además, mostramos que esta segmentación basada en capacidades representa una nueva dimensión de currículo ortogonal a los tradicionales curricula basados en dificultad, y combinar ambos da ganancias aditivas adicionales. Nuestros modelos de entrenamiento escalonado alcanzan un mejor rendimiento entre VLMs con pesos abiertos, estableciendo resultados avanzados en varias tareas de matemáticas visuales y percepción (por ejemplo, +5.2% en WeMath y +3.7% en RealWorldQA) en comparación con el modelo base.",
        "de": "Neue Fortschritte in Vision-Language-Modellen (VLMs) betonen lange Ketten von Gedanken; doch wir finden, dass ihre Leistung bei visuellen Aufgaben hauptsächlich durch fehlende visuelle Wahrnehmung statt durch Schlussfolgerung selbst begrenzt ist. In dieser Arbeit untersuchen wir systematisch den Wechselwirkung zwischen Wahrnehmung und Schlussfolgerung in VLM-Nachtraining, indem wir ihre Fähigkeiten in drei separate Trainingsphasen zerlegen: visuelle Wahrnehmung, visuelle Schlussfolgerung und textuelle Schlussfolgerung, mit spezialisierten Trainingsdaten. Wir zeigen, dass visuelle Wahrnehmung (a) gezielte Optimierung mit spezialisierten Daten benötigt; (b) eine grundlegende Grundlage ist, die durch stufenweises Training gefestigt werden sollte, bevor visuelle Schlussfolgerung verfeinert wird; und (c) effektiver durch RL als durch SFT-basierte Caption-Lernen gelernt wird. Unsere Experimente an mehreren VLMs zeigen, dass stufenweises Training die Leistung bei visueller Wahrnehmung und Schlussfolgerung über gemischtes Training konstant verbessert. Besonders bemerkenswert ist, dass Modelle mit unserem Ansatz eine um 1,5 % höhere Schlussfolgerungsgenauigkeit mit 20,8 % kürzeren Schlussfolgerungspfaden erreichen, was darauf hindeutet, dass bessere Wahrnehmung den Bedarf an übermäßiger Schlussfolgerung verringert. Darüber hinaus zeigen wir, dass diese fähigkeitsbasierte Stufung eine neue Curriculum-Dimension darstellt, die orthogonal zu traditionellen Schwierigkeitscurricula steht, und die Kombination beider ergibt weitere additive Gewinne. Unsere stufenweisen Trainingsmodelle erreichen bessere Leistung unter offenen-VLM-Modellen und erzielen fortgeschrittene Ergebnisse bei mehreren visuellen Mathematik- und Wahrnehmungsaufgaben (z. B. +5,2 % bei WeMath und +3,7 % bei RealWorldQA) im Vergleich zum Basismodell."
      }
    },
    {
      "arxivId": "2605.20176v1",
      "title": "ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning",
      "summary": "Large language models (LLMs) and agentic systems have shown promise for clinical decision support, but existing works largely assume that evidence has already been curated and handed to the model. Real-world clinical workflows instead require agents to actively seek, iteratively plan, and synthesize multimodal evidence from heterogeneous sources. In this paper, we introduce ClinSeekAgent, an automated agentic framework for dynamic multimodal evidence seeking that shifts the paradigm from passive evidence consumption to active evidence acquisition. Given only a clinical query and access to raw data sources, ClinSeekAgent gathers evidence by querying medical knowledge bases, navigating raw EHRs, and invoking medical imaging tools; refines its hypotheses as new information emerges; and integrates the collected evidence into grounded clinical decisions. ClinSeekAgent serves both as an inference-time agent for frontier LLMs and as a training-time pipeline for distilling high-quality agent trajectories into compact open-source models. To validate its inference-time effectiveness, we construct ClinSeek-Bench, which pairs Curated Input reasoning from fixed pre-selected evidence with Automated Evidence-Seeking over raw clinical data. On text-only EHR tasks, ClinSeekAgent improves Claude Opus 4.6 from 60.0 to 63.2 overall F1 and MiniMax M2.5 from 43.1 to 47.3, with positive risk-prediction gains in 7 out of 9 evaluated host models. On multimodal tasks, ClinSeekAgent improves Claude Opus 4.6 from 47.5 to 62.6 (+15.1); all evaluated models improve across the three CXR-related task groups. We further validate ClinSeekAgent as a training pipeline by distilling agentic evidence-seeking trajectories into ClinSeek-35B-A3B, which achieves 34.0 average F1 on existing AgentEHR-Bench, improving over its Qwen3.5-35B-A3B baseline by +11.9 points and approaching Claude Opus 4.6.",
      "authors": [
        "Juncheng Wu",
        "Letian Zhang",
        "Yuhan Wang",
        "Haoqin Tu",
        "Hardy Chen",
        "Zijun Wang"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T17:58:37Z",
      "submittedAt": "2026-05-19T17:58:37Z",
      "upvotes": 0,
      "comments": 0,
      "thumbnail": null,
      "arxivUrl": "https://arxiv.org/abs/2605.20176v1",
      "pdfUrl": "https://arxiv.org/pdf/2605.20176v1",
      "hfUrl": "https://arxiv.org/abs/2605.20176v1",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "cs.CL"
      ],
      "source": "arXiv cs.CL",
      "sourceType": "arxiv",
      "title_zh": "ClinSeekAgent：自动化多模态证据获取的代理临床推理框架",
      "summary_zh": "ClinSeekAgent实现多模态证据主动获取，提升临床决策效果，改进F1指标并优于基线模型。",
      "title_i18n": {
        "en": "ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning",
        "zh-CN": "ClinSeekAgent：自动化多模态证据获取的代理临床推理框架",
        "ja": "ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning",
        "ko": "ClinSeekAgent: Agentic Clinical Reasoning을 위한 다중모달 증거 탐색 자동화",
        "es": "ClinSeekAgent: Automating Multimodal Evidence Seeking for Agentic Clinical Reasoning",
        "de": "ClinSeekAgent: Automatisieren des multimodalen Beweissuchens für agen-tische klinische Reasoning"
      },
      "summary_i18n": {
        "en": "Large language models (LLMs) and agentic systems have shown promise for clinical decision support, but existing works largely assume that evidence has already been curated and handed to the model. Real-world clinical workflows instead require agents to actively seek, iteratively plan, and synthesize multimodal evidence from heterogeneous sources. In this paper, we introduce ClinSeekAgent, an automated agentic framework for dynamic multimodal evidence seeking that shifts the paradigm from passive evidence consumption to active evidence acquisition. Given only a clinical query and access to raw data sources, ClinSeekAgent gathers evidence by querying medical knowledge bases, navigating raw EHRs, and invoking medical imaging tools; refines its hypotheses as new information emerges; and integrates the collected evidence into grounded clinical decisions. ClinSeekAgent serves both as an inference-time agent for frontier LLMs and as a training-time pipeline for distilling high-quality agent trajectories into compact open-source models. To validate its inference-time effectiveness, we construct ClinSeek-Bench, which pairs Curated Input reasoning from fixed pre-selected evidence with Automated Evidence-Seeking over raw clinical data. On text-only EHR tasks, ClinSeekAgent improves Claude Opus 4.6 from 60.0 to 63.2 overall F1 and MiniMax M2.5 from 43.1 to 47.3, with positive risk-prediction gains in 7 out of 9 evaluated host models. On multimodal tasks, ClinSeekAgent improves Claude Opus 4.6 from 47.5 to 62.6 (+15.1); all evaluated models improve across the three CXR-related task groups. We further validate ClinSeekAgent as a training pipeline by distilling agentic evidence-seeking trajectories into ClinSeek-35B-A3B, which achieves 34.0 average F1 on existing AgentEHR-Bench, improving over its Qwen3.5-35B-A3B baseline by +11.9 points and approaching Claude Opus 4.6.",
        "zh-CN": "ClinSeekAgent实现多模态证据主动获取，提升临床决策效果，改进F1指标并优于基线模型。",
        "ja": "大規模言語モデルとエージェントシステムは臨床意思決定支援に有望だが、既存の研究では証拠がすでに整えられていると仮定されている。実際の臨床ワークフローではエージェントが積極的に証拠を収集し、反復的に計画し、統合する必要がある。本論文では、動的なマルチモーダル証拠収集を自動化するClinSeekAgentを紹介する。",
        "ko": "LLMs와 에이전트 시스템은 임상 결정 지원에 가능성을 보였으나, 기존 연구는 이미 정제된 증거를 가정합니다. 실제 임상 워크플로우에서는 에이전트가 적극적으로 증거를 수집해야 합니다. 본 논문에서는 ClinSeekAgent를 소개합니다.",
        "es": "Introduce ClinSeekAgent, an automated framework for dynamic multimodal evidence seeking in clinical reasoning. It improves F1 scores and risk prediction.",
        "de": "ClinSeekAgent ist ein automatisiertes agen-tisches Framework zur dynamischen multimodalen Beweissuche, das den Paradigmenwechsel von passiver zu aktiver Beweisakquise ermöglicht."
      }
    },
    {
      "arxivId": "2605.20170v1",
      "title": "KoRe: Compact Knowledge Representations for Large Language Models",
      "summary": "Modern Large Language Models (LLMs) have shown impressive performances in user-facing tasks such as question answering, as well as consistent improvements in reasoning capabilities. Still, the way these models encode knowledge seems inherently flawed: by design, LLMs encode world-knowledge within their parameters. This way of representing knowledge is inherently opaque, difficult to debug and update, and prone to hallucinations. On the other hand, Knowledge Graphs can provide human-readable and easily editable world knowledge representations, and their application in knowledge-intensive tasks has consistently proven beneficial to downstream performance. Nonetheless, current integration techniques require extensive retraining or finetuning. To overcome this issue, we introduce KoRe, a methodology to encode 1-hop sub-graphs into compact discrete knowledge tokens and inject them into a LLM backbone. We test the proposed approach on three established benchmarks, and report competitive performances coupled with a significant reduction (up to 10x) in token usage. Our results show that compact discrete KG representations can efficiently and effectively be used to ground modern LLMs.",
      "authors": [
        "Davide Cavicchini",
        "Fausto Giunchiglia",
        "Jacopo Staiano"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T17:53:29Z",
      "submittedAt": "2026-05-19T17:53:29Z",
      "upvotes": 0,
      "comments": 0,
      "thumbnail": null,
      "arxivUrl": "https://arxiv.org/abs/2605.20170v1",
      "pdfUrl": "https://arxiv.org/pdf/2605.20170v1",
      "hfUrl": "https://arxiv.org/abs/2605.20170v1",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "cs.CL"
      ],
      "source": "arXiv cs.CL",
      "sourceType": "arxiv",
      "title_zh": "KoRe：大型语言模型的紧凑知识表示",
      "summary_zh": "提出KoRe方法，将1跳子图编码为离散知识标记注入LLM，提升性能并减少10倍token使用。",
      "title_i18n": {
        "en": "KoRe: Compact Knowledge Representations for Large Language Models",
        "zh-CN": "KoRe：大型语言模型的紧凑知识表示",
        "ja": "KoRe: Compact Knowledge Representations for Large Language Models",
        "ko": "KoRe: 대규모 언어 모델을 위한 컴팩트 지식 표현",
        "es": "KoRe: Compact Knowledge Representations for Large Language Models",
        "de": "KoRe: Kompakte Wissensdarstellungen für große Sprachmodelle"
      },
      "summary_i18n": {
        "en": "Modern Large Language Models (LLMs) have shown impressive performances in user-facing tasks such as question answering, as well as consistent improvements in reasoning capabilities. Still, the way these models encode knowledge seems inherently flawed: by design, LLMs encode world-knowledge within their parameters. This way of representing knowledge is inherently opaque, difficult to debug and update, and prone to hallucinations. On the other hand, Knowledge Graphs can provide human-readable and easily editable world knowledge representations, and their application in knowledge-intensive tasks has consistently proven beneficial to downstream performance. Nonetheless, current integration techniques require extensive retraining or finetuning. To overcome this issue, we introduce KoRe, a methodology to encode 1-hop sub-graphs into compact discrete knowledge tokens and inject them into a LLM backbone. We test the proposed approach on three established benchmarks, and report competitive performances coupled with a significant reduction (up to 10x) in token usage. Our results show that compact discrete KG representations can efficiently and effectively be used to ground modern LLMs.",
        "zh-CN": "提出KoRe方法，将1跳子图编码为离散知识标记注入LLM，提升性能并减少10倍token使用。",
        "ja": "現代の大規模言語モデルは質問応答などのタスクで優れた性能を示すが、知識の表現方法には欠点がある。本研究では、コンパクトな知識トークンを用いたKoReを提案し、効率的な知識表現を実現する。",
        "ko": "현대 LLM은 질문 응답과 같은 사용자 작업에서 뛰어난 성능을 보입니다. 그러나 이 모델들이 지식을 인코딩하는 방식은 본질적으로 문제가 있습니다. KoRe는 1단계 하이퍼그래프를 컴팩트한 디지털 지식 토큰으로 인코딩합니다.",
        "es": "KoRe encodes knowledge graphs into compact tokens for LLMs, improving performance with reduced token usage and better explainability.",
        "de": "KoRe ist eine Methode zur Kodierung von 1-Hop-Untergraphen in kompakte diskrete Wissens-Token und deren Einbettung in ein LLM-Backbone."
      }
    },
    {
      "arxivId": "2605.15141",
      "title": "Causal Forcing++: Scalable Few-Step Autoregressive Diffusion Distillation for Real-Time Interactive Video Generation",
      "summary": "A novel causal consistency distillation method enables efficient frame-wise video generation with reduced latency and improved quality compared to existing chunk-wise approaches.",
      "authors": [
        "Min Zhao",
        "Hongzhou Zhu",
        "Kaiwen Zheng",
        "Zihan Zhou",
        "Bokai Yan",
        "Xinyuan Li"
      ],
      "organization": {
        "_id": "640d3084536d9fe0f005cac3",
        "name": "thu-ml",
        "fullname": "Tsinghua Machine Learning Group",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/1678587085174-633131798ef21f47308ce49b.jpeg"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 88,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15141.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15141",
      "pdfUrl": "https://arxiv.org/pdf/2605.15141.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15141",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "autoregressive diffusion",
        "causal consistency distillation",
        "causal CD",
        "frame-wise autoregression",
        "few-step AR initialization",
        "diffusion distillation"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Causal Forcing++：用于实时交互视频生成的可扩展少步自回归扩散蒸馏",
      "summary_zh": "提出一种因果一致性蒸馏方法，实现更高效帧级视频生成，降低延迟并提升质量",
      "title_i18n": {
        "en": "Causal Forcing++: Scalable Few-Step Autoregressive Diffusion Distillation for Real-Time Interactive Video Generation",
        "zh-CN": "Causal Forcing++：用于实时交互视频生成的可扩展少步自回归扩散蒸馏",
        "ja": "Causal Forcing++: Scalable Few-Step Autoregressive Diffusion Distillation for Real-Time Interactive Video Generation",
        "ko": "Causal Forcing++: 실시간 인터랙티브 비디오 생성을 위한 확장 가능한 소수 단계 자기회귀 확산 분산",
        "es": "Causal Forcing++: Scalable Few-Step Autoregressive Diffusion Distillation for Real-Time Interactive Video Generation",
        "de": "Causal Forcing++: Skalierbare Few-Step Autoregressive Diffusions-Distillation für Echtzeit-Interaktive Videoerzeugung"
      },
      "summary_i18n": {
        "en": "A novel causal consistency distillation method enables efficient frame-wise video generation with reduced latency and improved quality compared to existing chunk-wise approaches.",
        "zh-CN": "提出一种因果一致性蒸馏方法，实现更高效帧级视频生成，降低延迟并提升质量",
        "ja": "新しい因果的整合性蒸留法により、従来のチャンク単位アプローチよりも低遅延かつ高品質なフレーム単位ビデオ生成が可能になる。",
        "ko": "새로운 인과 일관성 분산 방법은 기존의 청크 기반 접근보다 더 낮은 지연과 개선된 품질로 프레임별 비디오 생성을 가능하게 합니다.",
        "es": "A novel method for efficient video generation with reduced latency and improved quality compared to chunk-wise approaches.",
        "de": "Eine neue kausale Konsistenz-Distillation-Methode ermöglicht effiziente Frame-für-Frame-Videoerzeugung mit reduziertem Latenz und verbesserter Qualität."
      }
    },
    {
      "arxivId": "2605.13831",
      "title": "Training Long-Context Vision-Language Models Effectively with Generalization Beyond 128K Context",
      "summary": "Long-context continued pre-training enhances vision-language models' ability to handle extended documents while maintaining performance across diverse contexts through strategic data mixture design.",
      "authors": [
        "Zhaowei Wang",
        "Lishu Luo",
        "Haodong Duan",
        "Weiwei Liu",
        "Sijin Wu",
        "Ji Luo"
      ],
      "organization": {
        "_id": "67d1140985ea0644e2f14b99",
        "name": "ByteDance-Seed",
        "fullname": "ByteDance Seed",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6535c9e88bde2fae19b6fb25/flkDUqd_YEuFsjeNET3r-.png"
      },
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 85,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13831.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13831",
      "pdfUrl": "https://arxiv.org/pdf/2605.13831.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13831",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "long-context modeling",
        "large vision-language models",
        "continued pre-training",
        "long-document VQA",
        "sequence-length distribution",
        "retrieval-heavy mixtures"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "使用GPT/LoRA有效训练长上下文视觉-语言模型",
      "summary_zh": "通过数据混合设计，提升GPT/LoRA在128K以上上下文的泛化能力",
      "title_i18n": {
        "en": "Training Long-Context Vision-Language Models Effectively with Generalization Beyond 128K Context",
        "zh-CN": "使用GPT/LoRA有效训练长上下文视觉-语言模型",
        "ja": "Training Long-Context Vision-Language Models Effectively with Generalization Beyond 128K Context",
        "ko": "128K 이상의 일반화를 통한 장기적 컨텍스트 비전-언어 모델 훈련",
        "es": "Training Long-Context Vision-Language Models Effectively with Generalization Beyond 128K Context",
        "de": "Effektives Trainieren von Lang-Kontext Vision-Language-Modellen mit Generalisierung über 128K Kontext"
      },
      "summary_i18n": {
        "en": "Long-context continued pre-training enhances vision-language models' ability to handle extended documents while maintaining performance across diverse contexts through strategic data mixture design.",
        "zh-CN": "通过数据混合设计，提升GPT/LoRA在128K以上上下文的泛化能力",
        "ja": "長文脈継続事前学習により、視覚言語モデルは広範な文脈で高い性能を維持しながら、長いドキュメントを処理できるようになる。",
        "ko": "장기적 연속 사전 훈련은 전문 문서 처리 능력을 향상시키며 다양한 컨텍스트에서 성능을 유지합니다.",
        "es": "Long-context pre-training enhances vision-language models' ability to handle extended documents and diverse contexts.",
        "de": "Lang-kontextuelle fortgesetzte Vortrainierung verbessert die Fähigkeit von Vision-Language-Modellen, erweiterte Dokumente zu verarbeiten."
      }
    },
    {
      "arxivId": "2605.19577",
      "title": "GoLongRL: Capability-Oriented Long Context Reinforcement Learning with Multitask Alignment",
      "summary": "GoLongRL presents an open-source approach for long-context reinforcement learning with diverse reward optimization through capability-oriented data construction and TMN-Reweight methodology.",
      "authors": [
        "Minxuan Lv",
        "Tiehua Mei",
        "Tanlong Du",
        "Junmin Chen",
        "Zhenpeng Su",
        "Ziyang Chen"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T00:00:00.000Z",
      "submittedAt": "2026-05-20T00:00:00.000Z",
      "upvotes": 29,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.19577.png",
      "arxivUrl": "https://arxiv.org/abs/2605.19577",
      "pdfUrl": "https://arxiv.org/pdf/2605.19577.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.19577",
      "githubRepo": "https://github.com/xiaoxuanNLP/GoLongRL",
      "githubStars": 7,
      "keywords": [
        "reinforcement learning",
        "long-context",
        "verifiable rewards",
        "post-training recipe",
        "RLVR",
        "GRPO"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "GoLongRL：基于多任务对齐的长上下文强化学习",
      "summary_zh": "提出GoLongRL方法，通过能力导向数据构建和TMN-Reweight实现长上下文强化学习的多样化奖励优化",
      "title_i18n": {
        "en": "GoLongRL: Capability-Oriented Long Context Reinforcement Learning with Multitask Alignment",
        "zh-CN": "GoLongRL：基于多任务对齐的长上下文强化学习",
        "ja": "GoLongRL: Capability-Oriented Long Context Reinforcement Learning with Multitask Alignment",
        "ko": "GoLongRL: 다중태스크 정렬을 통한 능력 중심의 장기적 컨텍스트 강화 학습",
        "es": "GoLongRL: Capability-Oriented Long Context Reinforcement Learning with Multitask Alignment",
        "de": "GoLongRL: Leistungsfokussierte Lang-Kontext Reinforcement Learning mit Multitask-Ausrichtung"
      },
      "summary_i18n": {
        "en": "GoLongRL presents an open-source approach for long-context reinforcement learning with diverse reward optimization through capability-oriented data construction and TMN-Reweight methodology.",
        "zh-CN": "提出GoLongRL方法，通过能力导向数据构建和TMN-Reweight实现长上下文强化学习的多样化奖励优化",
        "ja": "GoLongRLは、多タスク調整を通じて能力指向のデータ構築とTMN-Reweight手法を用いた長文脈強化学習のオープンソースアプローチを提示する。",
        "ko": "GoLongRL은 능력 중심의 데이터 구축과 TMN-Reweight 방법론을 통해 다양한 보상 최적화를 위한 오픈소스 접근법을 제시합니다.",
        "es": "GoLongRL offers an open-source approach for long-context reinforcement learning with multitask alignment and reward optimization.",
        "de": "GoLongRL bietet einen Open-Source-Ansatz für lang-kontextuelles Reinforcement Learning mit vielfältiger Belohnungsoptimierung."
      }
    },
    {
      "arxivId": "2605.17283",
      "title": "OProver: A Unified Framework for Agentic Formal Theorem Proving",
      "summary": "OProver is a unified framework for agentic formal theorem proving in Lean 4 that improves proof generation through iterative training with verified proofs and compiler feedback.",
      "authors": [
        "David Ma",
        "Kaijing Ma",
        "Shawn Guo",
        "Yunfeng Shi",
        "Enduo Zhao",
        "Jiajun Shi"
      ],
      "organization": {
        "_id": "6384ee7fdfffab482400b938",
        "name": "m-a-p",
        "fullname": "Multimodal Art Projection",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6382252f54421460665ec501/oNH4MDqpSiMWJpxbaSLOv.png"
      },
      "publishedAt": "2026-05-17T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 28,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.17283.png",
      "arxivUrl": "https://arxiv.org/abs/2605.17283",
      "pdfUrl": "https://arxiv.org/pdf/2605.17283.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.17283",
      "githubRepo": "https://github.com/multimodal-art-projection/OProver",
      "githubStars": 7,
      "keywords": [
        "agentic formal theorem proving",
        "Lean 4",
        "proof generation",
        "verifier-aware training",
        "continued pretraining",
        "iterative post-training"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "OProver：一种统一的代理形式化定理证明框架",
      "summary_zh": "OProver通过迭代训练和编译器反馈提升Lean 4中的形式化定理证明生成效果。",
      "title_i18n": {
        "en": "OProver: A Unified Framework for Agentic Formal Theorem Proving",
        "zh-CN": "OProver：一种统一的代理形式化定理证明框架",
        "ja": "OProver: A Unified Framework for Agentic Formal Theorem Proving",
        "ko": "OProver: Agentic 공식 정리 증명을 위한 통합 프레임워크",
        "es": "OProver: A Unified Framework for Agentic Formal Theorem Proving",
        "de": "OProver: Einheitlicher Rahmen für agen-tisches formales Theorembeweisen"
      },
      "summary_i18n": {
        "en": "OProver is a unified framework for agentic formal theorem proving in Lean 4 that improves proof generation through iterative training with verified proofs and compiler feedback.",
        "zh-CN": "OProver通过迭代训练和编译器反馈提升Lean 4中的形式化定理证明生成效果。",
        "ja": "OProverはLean 4におけるエージェント形式定理証明の統一フレームワークであり、検証済み証明とコンパイラフィードバックによる反復トレーニングで証明生成を改善する。",
        "ko": "OProver는 Lean 4에서 반복적인 훈련과 검증된 증명 및 컴파일러 피드백을 통해 증명 생성을 개선하는 통합 프레임워크입니다.",
        "es": "OProver is a unified framework for agentic formal theorem proving in Lean 4, improving proof generation through iterative training.",
        "de": "OProver ist ein einheitlicher Rahmen für agen-tisches formales Theorembeweisen in Lean 4, der durch iterative Training mit verifizierten Beweisen verbessert wird."
      }
    },
    {
      "arxivId": "2605.18703",
      "title": "EnvFactory: Scaling Tool-Use Agents via Executable Environments Synthesis and Robust RL",
      "summary": "EnvFactory automates the creation of executable tool environments and natural multi-turn trajectories for training LLMs with agentic reinforcement learning, achieving superior performance with fewer resources.",
      "authors": [
        "Minrui Xu",
        "Zilin Wang",
        "Mengyi DENG",
        "Zhiwei Li",
        "Zhicheng Yang",
        "Xiao Zhu"
      ],
      "organization": {
        "_id": "6980a3aede8ee5f0a7de0007",
        "name": "LARK-Lab",
        "fullname": "LARK Lab@HKUST (GZ)",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/63b6af3accebeadccc868efd/H6b3XExLG87O3ZFPV7Pr5.png"
      },
      "publishedAt": "2026-05-18T00:00:00.000Z",
      "submittedAt": "2026-05-20T00:00:00.000Z",
      "upvotes": 23,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.18703.png",
      "arxivUrl": "https://arxiv.org/abs/2605.18703",
      "pdfUrl": "https://arxiv.org/pdf/2605.18703.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.18703",
      "githubRepo": "https://github.com/LARK-AI-Lab/EnvFactory",
      "githubStars": 16,
      "keywords": [
        "Agentic Reinforcement Learning",
        "tool-use capabilities",
        "execution environments",
        "synthetic trajectories",
        "topology-aware sampling",
        "calibrated refinement"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "EnvFactory：通过可执行环境合成与鲁棒强化学习扩展工具使用代理",
      "summary_zh": "EnvFactory自动化生成可执行工具环境和多轮轨迹，以更少资源实现LLMs的优越性能。",
      "title_i18n": {
        "en": "EnvFactory: Scaling Tool-Use Agents via Executable Environments Synthesis and Robust RL",
        "zh-CN": "EnvFactory：通过可执行环境合成与鲁棒强化学习扩展工具使用代理",
        "ja": "EnvFactory: Scaling Tool-Use Agents via Executable Environments Synthesis and Robust RL",
        "ko": "EnvFactory: 실행 가능한 환경 생성 및 강건한 RL을 통한 도구 사용 에이전트 확장",
        "es": "EnvFactory: Scaling Tool-Use Agents via Executable Environments Synthesis and Robust RL",
        "de": "EnvFactory: Skalierung von Tool-Use-Agenten durch ausführbare Umgebungssynthese und robuste RL"
      },
      "summary_i18n": {
        "en": "EnvFactory automates the creation of executable tool environments and natural multi-turn trajectories for training LLMs with agentic reinforcement learning, achieving superior performance with fewer resources.",
        "zh-CN": "EnvFactory自动化生成可执行工具环境和多轮轨迹，以更少资源实现LLMs的优越性能。",
        "ja": "EnvFactoryは、実行可能なツール環境と自然な複数ターントラジェクトリを自動生成し、エージェント強化学習でのLLMトレーニングに優れた性能を達成する。",
        "ko": "EnvFactory는 LLM을 위한 실행 가능한 도구 환경과 자연스러운 멀티턴 트래잭션을 자동 생성하여 더 적은 자원으로 우수한 성능을 달성합니다.",
        "es": "EnvFactory automates the creation of tool environments and trajectories for training LLMs with agentic reinforcement learning.",
        "de": "EnvFactory automatisiert die Erstellung ausführbarer Werkzeugumgebungen und natürlicher Mehrphasen-Trajektorien für das Training von LLMs."
      }
    },
    {
      "arxivId": "2605.15178",
      "title": "SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer",
      "summary": "SANA-WM is an efficient 2.6B-parameter world model that generates high-fidelity 720p videos with precise camera control, achieving industrial-level quality while significantly reducing computational requirements through hybrid attention, dual-camera branches, two-stage generation, and robust annotation pipelines.",
      "authors": [
        "Haoyi Zhu",
        "Haozhe Liu",
        "Yuyang Zhao",
        "Tian Ye",
        "Junsong Chen",
        "Jincheng Yu"
      ],
      "organization": {
        "_id": "60262b67268c201cdc8b7d43",
        "name": "nvidia",
        "fullname": "NVIDIA",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65df9200dc3292a8983e5017/Vs5FPVCH-VZBipV3qKTuy.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 76,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15178.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15178",
      "pdfUrl": "https://arxiv.org/pdf/2605.15178.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15178",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "world model",
        "Gated DeltaNet",
        "softmax attention",
        "6-DoF trajectory",
        "two-stage generation pipeline",
        "metric-scale pose supervision"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "SANA-WM：基于混合线性扩散Transformer的高效分钟级世界建模",
      "summary_zh": "SANA-WM通过混合注意力、双摄像头分支等方法实现高效视频生成，达到工业级质量。",
      "title_i18n": {
        "en": "SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer",
        "zh-CN": "SANA-WM：基于混合线性扩散Transformer的高效分钟级世界建模",
        "ja": "SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer",
        "ko": "SANA-WM: 하이브리드 선형 확산 트랜스포머를 이용한 효율적인 분단위 세계 모델링",
        "es": "SANA-WM: Efficient Minute-Scale World Modeling with Hybrid Linear Diffusion Transformer",
        "de": "SANA-WM: Effiziente Minute-Skalen Weltmodellierung mit hybrider linearer Diffusions-Transformer"
      },
      "summary_i18n": {
        "en": "SANA-WM is an efficient 2.6B-parameter world model that generates high-fidelity 720p videos with precise camera control, achieving industrial-level quality while significantly reducing computational requirements through hybrid attention, dual-camera branches, two-stage generation, and robust annotation pipelines.",
        "zh-CN": "SANA-WM通过混合注意力、双摄像头分支等方法实现高效视频生成，达到工业级质量。",
        "ja": "SANA-WMは、ハイブリッドアテンションや二段階生成などにより、計算要件を大幅に削減した2.6Bパラメータの効率的な世界モデルである。",
        "ko": "SANA-WM은 2.6B 매개변수 세계 모델로 고해상도 720p 영상을 생성하며, 하이브리드 어텐션 및 두 단계 생성을 통해 계산 요구사항을 크게 줄입니다.",
        "es": "SANA-WM is an efficient world model that generates high-fidelity videos with precise camera control and reduced computational needs.",
        "de": "SANA-WM ist ein effizientes 2,6B-Parameter-Weltmodell, das hochfidelere 720p-Videos mit präziser Kamera-Steuerung generiert."
      }
    },
    {
      "arxivId": "2605.14333",
      "title": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation",
      "summary": "InsightTok improves discrete visual tokenization for better text and face reconstruction through content-aware perceptual losses, enhancing autoregressive image generation quality.",
      "authors": [
        "Yang Yue",
        "Fangyun Wei",
        "Tianyu He",
        "Jinjing Zhao",
        "Zanlin Ni",
        "Zeyu Liu"
      ],
      "organization": {
        "_id": "69719700e3846c07669d13ee",
        "name": "Tsinghua-LeapLab",
        "fullname": "Tsinghua-LeapLab",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/63987ffb2ceb55aabe0852f3/hflTWNTGxeJx83xNkYrDB.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 31,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14333.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14333",
      "pdfUrl": "https://arxiv.org/pdf/2605.14333.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14333",
      "githubRepo": "https://github.com/LeapLabTHU/InsightTok",
      "githubStars": 30,
      "keywords": [
        "discrete visual tokenization",
        "autoregressive generators",
        "tokenizer",
        "discrete-tokenizer objectives",
        "text legibility",
        "facial fidelity"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "InsightTok：改进离散标记化中的文本和人脸保真度",
      "summary_zh": "通过内容感知的感知损失提升离散视觉标记化，改善自回归图像生成的文本和人脸重建质量。",
      "title_i18n": {
        "en": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation",
        "zh-CN": "InsightTok：改进离散标记化中的文本和人脸保真度",
        "ja": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation",
        "ko": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation",
        "es": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation",
        "de": "InsightTok: Improving Text and Face Fidelity in Discrete Tokenization for Autoregressive Image Generation"
      },
      "summary_i18n": {
        "en": "InsightTok improves discrete visual tokenization for better text and face reconstruction through content-aware perceptual losses, enhancing autoregressive image generation quality.",
        "zh-CN": "通过内容感知的感知损失提升离散视觉标记化，改善自回归图像生成的文本和人脸重建质量。",
        "ja": "InsightTokは、コンテンツに応じた知覚損失を用いて、テキストと顔の忠実度を向上させ、自己回帰画像生成の品質を向上させます。",
        "ko": "InsightTok은 내용 인식 감각 손실을 통해 텍스트 및 얼굴 재구성을 개선하여 자동 회귀 이미지 생성의 품질을 높입니다.",
        "es": "InsightTok improves discrete visual tokenization for better text and face reconstruction through content-aware perceptual losses, enhancing autoregressive image generation quality.",
        "de": "InsightTok verbessert die diskrete visuelle Tokenisierung für bessere Text- und Gesichtsrekonstruktion durch inhaltsbewusste perceptuelle Verluste, was die Qualität der autoregressiven Bildgenerierung erhöht."
      }
    },
    {
      "arxivId": "2605.15980",
      "title": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization",
      "summary": "Flash-GRPO improves training efficiency for video diffusion models by addressing temporal variance and gradient inconsistency through iso-temporal grouping and temporal gradient rectification.",
      "authors": [
        "Xiaoxuan He",
        "Siming Fu",
        "Zeyue Xue",
        "Weijie Wang",
        "Ruizhe He",
        "Yuming Li"
      ],
      "organization": null,
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 31,
      "comments": 0,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15980.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15980",
      "pdfUrl": "https://arxiv.org/pdf/2605.15980.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15980",
      "githubRepo": "https://github.com/Shredded-Pork/Flash-GRPO",
      "githubStars": 28,
      "keywords": [
        "Group Relative Policy Optimization",
        "video diffusion models",
        "parametered model",
        "sliding window subsampling",
        "full trajectory training",
        "single-step training framework"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Flash-GRPO：通过单步策略优化实现视频扩散的高效对齐",
      "summary_zh": "使用时序分组和梯度校正提升视频扩散模型训练效率",
      "title_i18n": {
        "en": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization",
        "zh-CN": "Flash-GRPO：通过单步策略优化实现视频扩散的高效对齐",
        "ja": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization",
        "ko": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization",
        "es": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization",
        "de": "Flash-GRPO: Efficient Alignment for Video Diffusion via One-Step Policy Optimization"
      },
      "summary_i18n": {
        "en": "Flash-GRPO improves training efficiency for video diffusion models by addressing temporal variance and gradient inconsistency through iso-temporal grouping and temporal gradient rectification.",
        "zh-CN": "使用时序分组和梯度校正提升视频扩散模型训练效率",
        "ja": "Flash-GRPOは、等時的グループ化と時間勾配修正により、動画拡散モデルのトレーニング効率を向上させます。",
        "ko": "Flash-GRPO는 동일 시간 그룹화와 시간적 기울기 교정으로 시간적 변동성과 기울기 불일치를 해결하여 비디오 확산 모델의 학습 효율성을 개선합니다.",
        "es": "Flash-GRPO improves training efficiency for video diffusion models by addressing temporal variance and gradient inconsistency through iso-temporal grouping and temporal gradient rectification.",
        "de": "Flash-GRPO steigert die Trainingseffizienz für Videodiffusionsmodelle durch iso-temporale Gruppierung und zeitliche Gradientenkorrektur, um zeitliche Varianz und Gradienteninkonsistenz zu adressieren."
      }
    },
    {
      "arxivId": "2605.06554",
      "title": "Long Context Pre-Training with Lighthouse Attention",
      "summary": "Lighthouse Attention enables efficient training of causal transformers at long sequences by using hierarchical selection-based attention that reduces computational complexity while maintaining model performance.",
      "authors": [
        "Bowen Peng",
        "Subho Ghosh",
        "Jeffrey Quesnelle"
      ],
      "organization": {
        "_id": "643b858ba856622f9790cc66",
        "name": "NousResearch",
        "fullname": "NousResearch",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6317aade83d8d2fd903192d9/tPLjYEeP6q1w0j_G2TJG_.png"
      },
      "publishedAt": "2026-05-07T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 27,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.06554.png",
      "arxivUrl": "https://arxiv.org/abs/2605.06554",
      "pdfUrl": "https://arxiv.org/pdf/2605.06554.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.06554",
      "githubRepo": "https://github.com/ighoshsubho/lighthouse-attention",
      "githubStars": 35,
      "keywords": [
        "scaled dot-product attention",
        "hierarchical attention",
        "causal transformers",
        "gradient-free",
        "sequence length",
        "attention mechanism"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "基于灯塔注意力的长上下文预训练",
      "summary_zh": "使用灯塔注意力提升因果Transformer在长序列上的训练效率，降低计算复杂度并保持模型性能。",
      "title_i18n": {
        "en": "Long Context Pre-Training with Lighthouse Attention",
        "zh-CN": "基于灯塔注意力的长上下文预训练",
        "ja": "Long Context Pre-Training with Lighthouse Attention",
        "ko": "Long Context Pre-Training with Lighthouse Attention",
        "es": "Long Context Pre-Training with Lighthouse Attention",
        "de": "Long Context Pre-Training with Lighthouse Attention"
      },
      "summary_i18n": {
        "en": "Lighthouse Attention enables efficient training of causal transformers at long sequences by using hierarchical selection-based attention that reduces computational complexity while maintaining model performance.",
        "zh-CN": "使用灯塔注意力提升因果Transformer在长序列上的训练效率，降低计算复杂度并保持模型性能。",
        "ja": "Lighthouse Attentionは、計算複雑性を低減しながらモデル性能を維持し、長文シーケンスでの効率的なトレーニングを可能にします。",
        "ko": "Lighthouse Attention는 계층적 선택 기반 주의를 사용하여 장거리 시퀀스에서 인과 트랜스포머의 효율적인 학습을 가능하게 합니다.",
        "es": "Lighthouse Attention enables efficient training of causal transformers at long sequences by using hierarchical selection-based attention that reduces computational complexity while maintaining model performance.",
        "de": "Lighthouse Attention ermöglicht effizientes Training von kausalen Transformers bei langen Sequenzen durch hierarchische Auswahl- basierte Aufmerksamkeit, die Rechenkomplexität reduziert, ohne Modellleistung zu beeinträchtigen."
      }
    },
    {
      "arxivId": "2605.15529",
      "title": "Process Rewards with Learned Reliability",
      "summary": "BetaPRM introduces a distributional approach to process reward models that predicts both success probabilities and prediction reliability, enabling adaptive computation allocation that reduces token usage while maintaining accuracy.",
      "authors": [
        "Jinyuan Li",
        "Langlin Huang",
        "Chengsong Huang",
        "Shaoyang Xu",
        "Donghong Cai",
        "Yuyi Yang"
      ],
      "organization": {
        "_id": "670035f24055c4569f7dd024",
        "name": "HINT-lab",
        "fullname": "Huang's INTelligence lab",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/64efbf39b3610349e84db417/tbNZtAX3vJeGo2Rag_7ZN.png"
      },
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-20T00:00:00.000Z",
      "upvotes": 29,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15529.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15529",
      "pdfUrl": "https://arxiv.org/pdf/2605.15529.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15529",
      "githubRepo": "https://github.com/JinYuanLi0012/Beta-Binomial-PRM",
      "githubStars": 2,
      "keywords": [
        "Process Reward Models",
        "BetaPRM",
        "distributional PRM",
        "Beta belief",
        "Beta-Binomial likelihood",
        "Monte Carlo continuations"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "用学习的可靠性处理奖励模型",
      "summary_zh": "BetaPRM通过预测成功概率和预测可靠性，实现自适应计算分配，减少token使用量同时保持准确性。",
      "title_i18n": {
        "en": "Process Rewards with Learned Reliability",
        "zh-CN": "用学习的可靠性处理奖励模型",
        "ja": "Process Rewards with Learned Reliability",
        "ko": "Process Rewards with Learned Reliability",
        "es": "Process Rewards with Learned Reliability",
        "de": "Process Rewards with Learned Reliability"
      },
      "summary_i18n": {
        "en": "BetaPRM introduces a distributional approach to process reward models that predicts both success probabilities and prediction reliability, enabling adaptive computation allocation that reduces token usage while maintaining accuracy.",
        "zh-CN": "BetaPRM通过预测成功概率和预测可靠性，实现自适应计算分配，减少token使用量同时保持准确性。",
        "ja": "BetaPRMは、成功確率と予測信頼性を予測する分布的アプローチを導入し、トークン使用量を削減します。",
        "ko": "BetaPRM은 성공 확률과 예측 신뢰도를 예측하는 분포 접근법을 도입하여 적응형 계산 할당을 가능하게 하여 토큰 사용을 줄이고 정확도를 유지합니다.",
        "es": "BetaPRM introduces a distributional approach to process reward models that predicts both success probabilities and prediction reliability, enabling adaptive computation allocation that reduces token usage while maintaining accuracy.",
        "de": "BetaPRM führt einen verteilten Ansatz zur Verarbeitung von Reward-Modellen ein, der Erfolgs- und Zuverlässigkeitswahrscheinlichkeiten vorhersagt, um adaptive Berechnungsressourcen zu ermöglichen und Token-Nutzung zu reduzieren."
      }
    },
    {
      "arxivId": "2605.13062",
      "title": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling",
      "summary": "Recent image editing models have achieved remarkable progress in instruction following, multimodal understanding, and complex visual editing. However, existing benchmarks often fail to faithfully reflect human judgment, especially for strong frontier models, due to limited task difficulty and coarse-grained evaluation protocols. In parallel, reward models have become increasingly important for RL-based image editing optimization, yet existing reward model benchmarks still rely on unrealistic evaluation settings that deviate from practical RL scenarios. These limitations hinder reliable assessment of both image editing models and reward models. To address these challenges, we introduce Edit-Compass and EditReward-Compass, a unified evaluation suite for image editing and reward modeling. Edit-Compass contains 2,388 carefully annotated instances spanning six progressively challenging task categories, covering capabilities such as world knowledge reasoning, visual reasoning, and multi-image editing. Beyond broad task coverage, Edit-Compass adopts a fine-grained multidimensional evaluation framework based on structured reasoning and carefully designed scoring rubrics. In parallel, EditReward-Compass contains 2,251 preference pairs that simulate realistic reward modeling scenarios during RL optimization.",
      "authors": [
        "Xuehai Bai",
        "Yang Shi",
        "Yi-Fan Zhang",
        "Xuanyu Zhu",
        "Yuran Wang",
        "Yifan Dai"
      ],
      "organization": null,
      "publishedAt": "2026-05-13T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 32,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.13062.png",
      "arxivUrl": "https://arxiv.org/abs/2605.13062",
      "pdfUrl": "https://arxiv.org/pdf/2605.13062.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.13062",
      "githubRepo": "https://github.com/bxhsort/Edit-Compass-and-EditReward-Compass",
      "githubStars": 13,
      "keywords": [
        "image editing models",
        "reward models",
        "reinforcement learning",
        "evaluation protocols",
        "benchmarking",
        "preference pairs"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Edit-Compass 与 EditReward-Compass：图像编辑与奖励建模的统一基准",
      "summary_zh": "构建 Edit-Compass 和 EditReward-Compass 基准，涵盖 2,388 个标注实例和 2,251 对偏好数据，支持细粒度多维评估与真实 RL 场景模拟。",
      "title_i18n": {
        "en": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling",
        "zh-CN": "Edit-Compass 与 EditReward-Compass：图像编辑与奖励建模的统一基准",
        "ja": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling",
        "ko": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling",
        "es": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling",
        "de": "Edit-Compass & EditReward-Compass: A Unified Benchmark for Image Editing and Reward Modeling"
      },
      "summary_i18n": {
        "en": "Recent image editing models have achieved remarkable progress in instruction following, multimodal understanding, and complex visual editing. However, existing benchmarks often fail to faithfully reflect human judgment, especially for strong frontier models, due to limited task difficulty and coarse-grained evaluation protocols. In parallel, reward models have become increasingly important for RL-based image editing optimization, yet existing reward model benchmarks still rely on unrealistic evaluation settings that deviate from practical RL scenarios. These limitations hinder reliable assessment of both image editing models and reward models. To address these challenges, we introduce Edit-Compass and EditReward-Compass, a unified evaluation suite for image editing and reward modeling. Edit-Compass contains 2,388 carefully annotated instances spanning six progressively challenging task categories, covering capabilities such as world knowledge reasoning, visual reasoning, and multi-image editing. Beyond broad task coverage, Edit-Compass adopts a fine-grained multidimensional evaluation framework based on structured reasoning and carefully designed scoring rubrics. In parallel, EditReward-Compass contains 2,251 preference pairs that simulate realistic reward modeling scenarios during RL optimization.",
        "zh-CN": "构建 Edit-Compass 和 EditReward-Compass 基准，涵盖 2,388 个标注实例和 2,251 对偏好数据，支持细粒度多维评估与真实 RL 场景模拟。",
        "ja": "Edit-CompassとEditReward-Compassは、画像編集と報酬モデリングの統合評価ベンチマークです。",
        "ko": "최근 이미지 편집 모델은 명령 수행, 다중 모달 이해, 복잡한 시각 편집에서 놀라운 진보를 이루었습니다. 그러나 기존 벤치마크는 작업 난이도 한계와 거친 평가 프로토콜로 인해 인간 판단을 충실히 반영하지 못합니다. 동시에 보상 모델은 RL 기반 이미지 편집 최적화에 점점 더 중요해졌지만, 실제 RL 시나리오와 다른 비현실적인 평가 설정에 의존하고 있습니다. 이러한 제한은 이미지 편집 모델과 보상 모델 모두에 대한 신뢰할 수 있는 평가를 방해합니다. 이러한 문제를 해결하기 위해 우리는 Edit-Compass와 EditReward-Compass를 도입합니다. 이는 이미지 편집과 보상 모델링을 위한 통합 평가 세트입니다. Edit-Compass는 세계 지식 추론, 시각적 추론 및 다이미지 편집과 같은 능력을 포함하는 6개의 점진적으로 어려워지는 작업 카테고리에 걸쳐 2,388개의 철저히 주석된 인스턴스를 포함합니다. 넓은 작업 커버리지를 넘어서, Edit-Compass는 구조화된 추론과 철저히 설계된 평가 기준을 기반으로 한 세부적인 다차원 평가 프레임워크를 채택합니다. 동시에 EditReward-Compass는 RL 최적화 중 현실적인 보상 모델링 시나리오를 시뮬레이션하는 2,251개의 선호 쌍을 포함합니다.",
        "es": "Recent image editing models have achieved remarkable progress in instruction following, multimodal understanding, and complex visual editing. However, existing benchmarks often fail to faithfully reflect human judgment, especially for strong frontier models, due to limited task difficulty and coarse-grained evaluation protocols. In parallel, reward models have become increasingly important for RL-based image editing optimization, yet existing reward model benchmarks still rely on unrealistic evaluation settings that deviate from practical RL scenarios. These limitations hinder reliable assessment of both image editing models and reward models. To address these challenges, we introduce Edit-Compass and EditReward-Compass, a unified evaluation suite for image editing and reward modeling. Edit-Compass contains 2,388 carefully annotated instances spanning six progressively challenging task categories, covering capabilities such as world knowledge reasoning, visual reasoning, and multi-image editing. Beyond broad task coverage, Edit-Compass adopts a fine-grained multidimensional evaluation framework based on structured reasoning and carefully designed scoring rubrics. In parallel, EditReward-Compass contains 2,251 preference pairs that simulate realistic reward modeling scenarios during RL optimization.",
        "de": "Neue Bildbearbeitungsmodelle haben Fortschritte in Anweisungsfolge, Multimodalität und komplexer visueller Bearbeitung erzielt. Allerdings fehlen vorhandene Benchmarks eine genaue Wiedergabe menschlicher Urteile, besonders für starke Modelle, aufgrund begrenzter Aufgabenschwierigkeit und grober Bewertungsmethoden. Gleichzeitig sind Reward-Modelle für RL-basierte Bildbearbeitung immer wichtiger geworden, doch bestehende Benchmarking-Methoden verwenden unrealistische Bewertungsszenarien, die von praktischen RL-Szenarien abweichen. Diese Einschränkungen behindern die zuverlässige Bewertung von Bildbearbeitungs- und Reward-Modellen. Um diese Herausforderungen zu lösen, führen wir Edit-Compass und EditReward-Compass ein, eine einheitliche Bewertungssuite für Bildbearbeitung und Reward-Modellierung. Edit-Compass enthält 2.388 sorgfältig annotierte Instanzen über sechs zunehmend schwierige Aufgabenbereiche, die Fähigkeiten wie Weltwissen-Reasoning, visuelles Reasoning und mehrbildliche Bearbeitung abdecken. Neben umfassender Aufgabenabdeckung verwendet Edit-Compass ein fein abgestimmtes multidimensionales Bewertungssystem basierend auf strukturierter Reasoning und sorgfältig gestalteten Bewertungsrubriken. Parallel dazu enthält EditReward-Compass 2.251 Präferenzpaare, die realistische Reward-Modellierungsszenarien während der RL-Optimierung simulieren."
      }
    },
    {
      "arxivId": "2605.17260",
      "title": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs",
      "summary": "LiteFrame, a lightweight video encoder with Compressed Token Distillation training method, reduces latency and increases frame processing capacity for long-form video understanding in Video LLMs while maintaining accuracy.",
      "authors": [
        "Jihwan Kim",
        "Nikhil Parthasarathy",
        "Danfeng Qin",
        "Junhwa Hur",
        "Deqing Sun",
        "Bohyung Han"
      ],
      "organization": {
        "_id": "60f6cbb2852126bac698c89e",
        "name": "deepmind",
        "fullname": "Deepmind",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/1638956859875-5f1158120c833276f61f1a84.jpeg"
      },
      "publishedAt": "2026-05-17T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 20,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.17260.png",
      "arxivUrl": "https://arxiv.org/abs/2605.17260",
      "pdfUrl": "https://arxiv.org/pdf/2605.17260.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.17260",
      "githubRepo": "https://github.com/jjihwan/LiteFrame",
      "githubStars": 11,
      "keywords": [
        "Video Large Language Models",
        "visual-token context length",
        "post-hoc token reduction",
        "vision encoder",
        "Compressed Token Distillation",
        "token distillation"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "LiteFrame：高效视觉编码器实现视频LLM的帧扩展",
      "summary_zh": "LiteFrame通过压缩令牌蒸馏方法提升视频LLM的帧处理能力并降低延迟，同时保持准确性。",
      "title_i18n": {
        "en": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs",
        "zh-CN": "LiteFrame：高效视觉编码器实现视频LLM的帧扩展",
        "ja": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs",
        "ko": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs",
        "es": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs",
        "de": "LiteFrame: Efficient Vision Encoders Unlock Frame Scaling in Video LLMs"
      },
      "summary_i18n": {
        "en": "LiteFrame, a lightweight video encoder with Compressed Token Distillation training method, reduces latency and increases frame processing capacity for long-form video understanding in Video LLMs while maintaining accuracy.",
        "zh-CN": "LiteFrame通过压缩令牌蒸馏方法提升视频LLM的帧处理能力并降低延迟，同时保持准确性。",
        "ja": "LiteFrameは、圧縮トークン蒸留法を用いて、動画LLMにおけるフレームスケーリングを効率的に実現します。",
        "ko": "LiteFrame는 압축 토큰 증류 학습 방법을 가진 경량 비디오 인코더로, 비디오 LLM에서 긴 형식의 비디오 이해를 위한 지연 시간을 줄이고 프레임 처리 용량을 늘립니다.",
        "es": "LiteFrame, a lightweight video encoder with Compressed Token Distillation training method, reduces latency and increases frame processing capacity for long-form video understanding in Video LLMs while maintaining accuracy.",
        "de": "LiteFrame, ein leichtgewichtiger Video-Encoder mit Compressed Token Distillation-Trainingsmethode, reduziert Latenz und erhöht die Frame-Verarbeitungskapazität für langes Video-Verständnis in Video-LLMs, während Genauigkeit erhalten bleibt."
      }
    },
    {
      "arxivId": "2605.15256",
      "title": "ReactiveGWM: Steering NPC in Reactive Game World Models",
      "summary": "ReactiveGWM enables dynamic player-NPC interactions in game worlds by decoupling player controls from NPC behaviors through diffusion models with cross-attention modules for game-agnostic strategy transfer.",
      "authors": [
        "Zeqing Wang",
        "Danze Chen",
        "Zhaohu Xing",
        "Zizhao Tong",
        "Yinhan Zhang",
        "Xingyi Yang"
      ],
      "organization": null,
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 24,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15256.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15256",
      "pdfUrl": "https://arxiv.org/pdf/2605.15256.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15256",
      "githubRepo": "https://github.com/INV-WZQ/ReactiveGWM",
      "githubStars": 36,
      "keywords": [
        "diffusion models",
        "cross-attention modules",
        "game-agnostic representation",
        "zero-shot strategy transfer",
        "player controls",
        "NPC behaviors"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "ReactiveGWM：在反应式游戏世界模型中引导NPC",
      "summary_zh": "通过扩散模型与交叉注意力模块解耦玩家控制与NPC行为，实现跨游戏策略迁移",
      "title_i18n": {
        "en": "ReactiveGWM: Steering NPC in Reactive Game World Models",
        "zh-CN": "ReactiveGWM：在反应式游戏世界模型中引导NPC",
        "ja": "ReactiveGWM: Steering NPC in Reactive Game World Models",
        "ko": "ReactiveGWM: Steering NPC in Reactive Game World Models",
        "es": "ReactiveGWM: Steering NPC in Reactive Game World Models",
        "de": "ReactiveGWM: Steering NPC in Reactive Game World Models"
      },
      "summary_i18n": {
        "en": "ReactiveGWM enables dynamic player-NPC interactions in game worlds by decoupling player controls from NPC behaviors through diffusion models with cross-attention modules for game-agnostic strategy transfer.",
        "zh-CN": "通过扩散模型与交叉注意力模块解耦玩家控制与NPC行为，实现跨游戏策略迁移",
        "ja": "ReactiveGWMは、ゲームに依存しない戦略転送を可能にするクロスアテンションモジュールを備えた拡散モデルで、NPCの動的制御を実現します。",
        "ko": "ReactiveGWM은 게임 간 전략 전달을 위한 크로스 어텐션 모듈이 포함된 확산 모델을 통해 플레이어 조작을 NPC 행동에서 분리함으로써 게임 세계에서 동적 플레이어-NPC 상호작용을 가능하게 합니다.",
        "es": "ReactiveGWM enables dynamic player-NPC interactions in game worlds by decoupling player controls from NPC behaviors through diffusion models with cross-attention modules for game-agnostic strategy transfer.",
        "de": "ReactiveGWM ermöglicht dynamische Spieler-NPC-Interaktionen in Spielweltmodellen durch Entkoppelung von Spielerkontrollen von NPC-Verhalten mittels Diffusionsmodellen mit Kreuz-Attention-Modulen für spielfreie Strategietransfer."
      }
    },
    {
      "arxivId": "2605.14386",
      "title": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning",
      "summary": "The Darwin Family framework enables training-free evolutionary merging of large language models through gradient-free weight-space recombination, achieving superior reasoning performance without additional training.",
      "authors": [
        "Taebong Kim",
        "Youngsik Hong",
        "Minsik Kim",
        "Sunyoung Choi",
        "Jaewon Jang",
        "Junghoon Shin"
      ],
      "organization": {
        "_id": "699976ab4a856643b7429675",
        "name": "FINAL-Bench",
        "fullname": "FINAL_Bench",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6905bc786cb49b1f11d32728/VZmuKH-liifeL2GCXlwka.jpeg"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 56,
      "comments": 3,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14386.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14386",
      "pdfUrl": "https://arxiv.org/pdf/2605.14386.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14386",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "evolutionary merging",
        "gradient-free weight-space recombination",
        "merge genome",
        "MRI-Trust Fusion",
        "trust parameter",
        "Architecture Mapper"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "达尔文家族：无需训练的MRI信任加权进化融合",
      "summary_zh": "通过无梯度权重空间重组实现大语言模型推理能力的无需训练进化融合，提升推理性能",
      "title_i18n": {
        "en": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning",
        "zh-CN": "达尔文家族：无需训练的MRI信任加权进化融合",
        "ja": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning",
        "ko": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning",
        "es": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning",
        "de": "Darwin Family: MRI-Trust-Weighted Evolutionary Merging for Training-Free Scaling of Language-Model Reasoning"
      },
      "summary_i18n": {
        "en": "The Darwin Family framework enables training-free evolutionary merging of large language models through gradient-free weight-space recombination, achieving superior reasoning performance without additional training.",
        "zh-CN": "通过无梯度权重空间重组实现大语言模型推理能力的无需训练进化融合，提升推理性能",
        "ja": "Darwin Familyは、トレーニングなしで大規模言語モデルを進化的に統合し、優れた推論性能を達成します。",
        "ko": "Darwin Family 프레임워크는 추가 학습 없이 대규모 언어 모델의 학습 없는 진화적 병합을 가능하게 하며, 그래디언트 없는 가중치 공간 재조합을 통해 우수한 추론 성능을 달성합니다.",
        "es": "The Darwin Family framework enables training-free evolutionary merging of large language models through gradient-free weight-space recombination, achieving superior reasoning performance without additional training.",
        "de": "Der Darwin Family-Framework ermöglicht trainingfreies evolutionäres Zusammenführen großer Sprachmodelle durch gradientfreie Gewichtsraum-Rekombination und erreicht bessere Reasoning-Performance ohne zusätzliche Trainings."
      }
    },
    {
      "arxivId": "2605.11550",
      "title": "The DAWN of World-Action Interactive Models",
      "summary": "World-Action Interactive Models (WAIMs) jointly model scene evolution and actions through recursive refinement, enabling effective long-horizon planning in autonomous driving scenarios.",
      "authors": [
        "Hongbo Lu",
        "Liang Yao",
        "Chenghao He",
        "Haoyu Wang",
        "Xiang Gu",
        "Xianfei Li"
      ],
      "organization": {
        "_id": "6a056ac51ccc9ecd592d8241",
        "name": "COWARobot",
        "fullname": "COWARobot",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/671618d2c6f6570d4f513004/Vc-gBeokccKOOhMP1GBsZ.png"
      },
      "publishedAt": "2026-05-12T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 22,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.11550.png",
      "arxivUrl": "https://arxiv.org/abs/2605.11550",
      "pdfUrl": "https://arxiv.org/pdf/2605.11550.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.11550",
      "githubRepo": "https://github.com/COOWAI/DAWN",
      "githubStars": 29,
      "keywords": [
        "World Action Models",
        "World-Action Interactive Models",
        "DAWN",
        "latent generative baseline",
        "semantic latent space",
        "World Predictor"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "DAWN世界-动作交互模型",
      "summary_zh": "提出WAIM模型，通过递归优化联合建模场景演化与动作，实现自动驾驶长周期规划的有效性",
      "title_i18n": {
        "en": "The DAWN of World-Action Interactive Models",
        "zh-CN": "DAWN世界-动作交互模型",
        "ja": "The DAWN of World-Action Interactive Models",
        "ko": "The DAWN of World-Action Interactive Models",
        "es": "El DAWN de los Modelos Interactivos Mundo-Acción",
        "de": "The DAWN of World-Action Interactive Models"
      },
      "summary_i18n": {
        "en": "World-Action Interactive Models (WAIMs) jointly model scene evolution and actions through recursive refinement, enabling effective long-horizon planning in autonomous driving scenarios.",
        "zh-CN": "提出WAIM模型，通过递归优化联合建模场景演化与动作，实现自动驾驶长周期规划的有效性",
        "ja": "World-Action Interactive Models (WAIMs) は、再帰的改善を通じてシーンの進化と行動を共同でモデル化し、自律走行シナリオでの効果的な長期計画を可能にします。",
        "ko": "World-Action Interactive Models (WAIMs)는 시나리오 진화와 행동을 함께 모델링하여 자율 주행 환경에서 효과적인 장기 계획을 가능하게 합니다.",
        "es": "Los Modelos Interactivos Mundo-Acción (WAIMs) modelan conjuntamente la evolución de la escena y las acciones mediante refinamiento recursivo, permitiendo un planeamiento efectivo a largo plazo en escenarios de conducción autónoma.",
        "de": "World-Action Interactive Models (WAIMs) modellieren Szenenentwicklung und Aktionen gemeinsam durch rekursive Verbesserung, was effektives langfristiges Planen in autonomen Fahrzeuganwendungen ermöglicht."
      }
    },
    {
      "arxivId": "2605.00180",
      "title": "RouteProfile: Elucidating the Design Space of LLM Profiles for Routing",
      "summary": "LLM profiling design significantly impacts routing performance, with structured profiles and query-level signals demonstrating superior reliability and generalization compared to flat profiles and domain-level signals.",
      "authors": [
        "Jingjun Xu",
        "Hongji Pu",
        "Tao Feng",
        "Haozhen Zhang",
        "Jiaxuan You",
        "Ge Liu"
      ],
      "organization": {
        "_id": "65448bef5b5d9185ba3202b9",
        "name": "UIUC-CS",
        "fullname": "University of Illinois at Urbana-Champaign",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/65448b21fcb96b8b48733729/ycqcXFayMTTD_KpE37067.jpeg"
      },
      "publishedAt": "2026-04-30T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 30,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.00180.png",
      "arxivUrl": "https://arxiv.org/abs/2605.00180",
      "pdfUrl": "https://arxiv.org/pdf/2605.00180.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.00180",
      "githubRepo": "https://github.com/ulab-uiuc/RouteProfile",
      "githubStars": 7,
      "keywords": [
        "large language models",
        "LLM routing",
        "LLM profiles",
        "router mechanism design",
        "structured information integration",
        "RouteProfile"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "RouteProfile：解析LLM路由配置文件的设计空间",
      "summary_zh": "设计结构化LLM配置文件提升路由性能，优于平面配置文件和领域级信号。",
      "title_i18n": {
        "en": "RouteProfile: Elucidating the Design Space of LLM Profiles for Routing",
        "zh-CN": "RouteProfile：解析LLM路由配置文件的设计空间",
        "ja": "RouteProfile: Elucidating the Design Space of LLM Profiles for Routing",
        "ko": "RouteProfile: Elucidating the Design Space of LLM Profiles for Routing",
        "es": "RouteProfile: Clarificando el Espacio de Diseño de Perfiles de LLM para Ruteo",
        "de": "RouteProfile: Elucidating the Design Space of LLM Profiles for Routing"
      },
      "summary_i18n": {
        "en": "LLM profiling design significantly impacts routing performance, with structured profiles and query-level signals demonstrating superior reliability and generalization compared to flat profiles and domain-level signals.",
        "zh-CN": "设计结构化LLM配置文件提升路由性能，优于平面配置文件和领域级信号。",
        "ja": "LLMプロファイリング設計はルーティング性能に大きく影響し、構造化されたプロファイルが非構造化プロファイルよりも優れた信頼性と汎化能力を示します。",
        "ko": "LLM 프로파일 설계는 라우팅 성능에 큰 영향을 미치며, 구조화된 프로파일이 평평한 프로파일보다 더 신뢰성과 일반성을 보입니다.",
        "es": "El diseño de perfiles de LLM tiene un impacto significativo en el rendimiento de ruteo, con perfiles estructurados y señales a nivel de consulta que demuestran mayor fiabilidad y generalización que perfiles planos y señales a nivel de dominio.",
        "de": "Die Gestaltung von LLM-Profilen beeinflusst die Routenleistung erheblich, wobei strukturierte Profile und abfrageweise Signale eine bessere Zuverlässigkeit und Verallgemeinerung zeigen als flache Profile und domainweise Signale."
      }
    },
    {
      "arxivId": "2605.12825",
      "title": "Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion",
      "summary": "Orthrus is a dual-architecture framework that combines autoregressive LLMs with diffusion models to achieve fast parallel token generation while maintaining exact inference fidelity through shared KV caches and consensus mechanisms.",
      "authors": [
        "Chien Van Nguyen",
        "Chaitra Hegde",
        "Van Cuong Pham",
        "Ryan A. Rossi",
        "Franck Dernoncourt",
        "Thien Huu Nguyen"
      ],
      "organization": null,
      "publishedAt": "2026-05-12T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 11,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.12825.png",
      "arxivUrl": "https://arxiv.org/abs/2605.12825",
      "pdfUrl": "https://arxiv.org/pdf/2605.12825.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.12825",
      "githubRepo": "https://github.com/chiennv2000/orthrus",
      "githubStars": 324,
      "keywords": [
        "autoregressive Large Language Models",
        "diffusion models",
        "parallel token generation",
        "Transformer",
        "Key-Value cache",
        "consensus mechanism"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Orthrus：通过双视角扩散实现高效并行标记生成",
      "summary_zh": "Orthrus结合自回归大模型与扩散模型，利用共享KV缓存和共识机制实现快速并行生成，保持精确推理精度。",
      "title_i18n": {
        "en": "Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion",
        "zh-CN": "Orthrus：通过双视角扩散实现高效并行标记生成",
        "ja": "Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion",
        "ko": "Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion",
        "es": "Orthrus: Generación de Tokens Paralelos Eficientes en Memoria mediante Difusión de Vista Dual",
        "de": "Orthrus: Memory-Efficient Parallel Token Generation via Dual-View Diffusion"
      },
      "summary_i18n": {
        "en": "Orthrus is a dual-architecture framework that combines autoregressive LLMs with diffusion models to achieve fast parallel token generation while maintaining exact inference fidelity through shared KV caches and consensus mechanisms.",
        "zh-CN": "Orthrus结合自回归大模型与扩散模型，利用共享KV缓存和共识机制实现快速并行生成，保持精确推理精度。",
        "ja": "Orthrus は、共有 KV キャッシュとコンセンサスメカニズムを用いて、正確な推論精度を維持しながら高速な並列トークン生成を実現する二重アーキテクチャフレームワークです。",
        "ko": "Orthrus는 공유 KV 캐시와 합의 메커니즘을 통해 정확한 추론 정확도를 유지하면서 빠른 병렬 토큰 생성을 위한 이중 아키텍처 프레임워크입니다.",
        "es": "Orthrus es un marco de arquitectura dual que combina LLMs autoregresivos con modelos de difusión para lograr una generación rápida de tokens paralelos manteniendo la fidelidad de inferencia exacta mediante cachés KV compartidas y mecanismos de consenso.",
        "de": "Orthrus ist ein Dual-Architektur-Framework, das autoregressive LLMs mit Diffusionsmodellen kombiniert, um schnelle parallele Token-Generierung zu erreichen, während exakte Inferenzgenauigkeit durch geteilte KV-Caches gewahrt bleibt."
      }
    },
    {
      "arxivId": "2605.15040",
      "title": "Orchard: An Open-Source Agentic Modeling Framework",
      "summary": "Orchard is an open-source framework for scalable agentic modeling that enables training diverse autonomous agents through specialized recipes for coding, GUI navigation, and personal assistance tasks.",
      "authors": [
        "Baolin Peng",
        "Wenlin Yao",
        "Qianhui Wu",
        "Hao Cheng",
        "Xiao Yu",
        "Rui Yang"
      ],
      "organization": {
        "_id": "68151d0f51add3813f3f7d1b",
        "name": "MicrosoftResearch",
        "fullname": "Microsoft Research",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6529a4f2f1205983224fa513/PeuVr7jSuJflmDBBGxoDX.png"
      },
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 18,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15040.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15040",
      "pdfUrl": "https://arxiv.org/pdf/2605.15040.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15040",
      "githubRepo": "https://github.com/microsoft/Orchard",
      "githubStars": 48,
      "keywords": [
        "agentic modeling",
        "large language models",
        "planning",
        "reasoning",
        "tool use",
        "multi-turn interaction"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "Orchard：一个开源的代理建模框架",
      "summary_zh": "开发了一个开源代理建模框架，通过专用配方实现代码、GUI导航和个性化任务的自主代理训练。",
      "title_i18n": {
        "en": "Orchard: An Open-Source Agentic Modeling Framework",
        "zh-CN": "Orchard：一个开源的代理建模框架",
        "ja": "Orchard: An Open-Source Agentic Modeling Framework",
        "ko": "Orchard: An Open-Source Agentic Modeling Framework",
        "es": "Orchard: Un Marco de Modelado Agente de Código Abierto",
        "de": "Orchard: An Open-Source Agentic Modeling Framework"
      },
      "summary_i18n": {
        "en": "Orchard is an open-source framework for scalable agentic modeling that enables training diverse autonomous agents through specialized recipes for coding, GUI navigation, and personal assistance tasks.",
        "zh-CN": "开发了一个开源代理建模框架，通过专用配方实现代码、GUI导航和个性化任务的自主代理训练。",
        "ja": "Orchard は、コード作成、GUIナビゲーション、個人支援タスクのための専門的なレシピを通じて、多様な自律エージェントのトレーニングを可能にするオープンソースフレームワークです。",
        "ko": "Orchard는 코드 작성, GUI 탐색 및 개인 지원 작업을 위한 전문 레시피를 통해 다양한 자율 에이전트를 훈련하는 오픈소스 프레임워크입니다.",
        "es": "Orchard es un marco de código abierto para modelado agente escalable que permite entrenar agentes autónomos diversos mediante recetas especializadas para tareas de programación, navegación de GUI y asistencia personal.",
        "de": "Orchard ist ein Open-Source-Framework für skalierbare agente Modellierung, das die Ausbildung vielfältiger autonomer Agenten durch spezialisierte Rezepte für Codierung, GUI-Navigation und persönliche Assistenz ermöglicht."
      }
    },
    {
      "arxivId": "2605.12411",
      "title": "Predicting Decisions of AI Agents from Limited Interaction through Text-Tabular Modeling",
      "summary": "AI agents can predict counterpart decisions in negotiation games by combining tabular features with LLM-based text representations and hidden states from a frozen observer model, outperforming direct prompting methods.",
      "authors": [
        "Eilam Shapira",
        "Moshe Tennenholtz",
        "Roi Reichart"
      ],
      "organization": {
        "_id": "6393322be2364bc1eea56e45",
        "name": "Technion",
        "fullname": "Technion Israel institute of technology",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/1670591001944-63926124526c29d5b5011374.jpeg"
      },
      "publishedAt": "2026-05-12T00:00:00.000Z",
      "submittedAt": "2026-05-14T00:00:00.000Z",
      "upvotes": 48,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.12411.png",
      "arxivUrl": "https://arxiv.org/abs/2605.12411",
      "pdfUrl": "https://arxiv.org/pdf/2605.12411.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.12411",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "target-adaptive text-tabular prediction",
        "tabular foundation model",
        "LLM-as-Observer",
        "few-shot prompting",
        "decision prediction",
        "bargaining games"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "通过文本-表格建模预测AI代理决策",
      "summary_zh": "结合表格特征与文本表示及冻结观察者模型隐状态，提升谈判游戏中AI代理决策预测性能",
      "title_i18n": {
        "en": "Predicting Decisions of AI Agents from Limited Interaction through Text-Tabular Modeling",
        "zh-CN": "通过文本-表格建模预测AI代理决策",
        "ja": "Predicting Decisions of AI Agents from Limited Interaction through Text-Tabular Modeling",
        "ko": "Predicting Decisions of AI Agents from Limited Interaction through Text-Tabular Modeling",
        "es": "Predecir Decisiones de Agentes de IA a Partir de Interacción Limitada mediante Modelado Texto-Tabular",
        "de": "Predicting Decisions of AI Agents from Limited Interaction through Text-Tabular Modeling"
      },
      "summary_i18n": {
        "en": "AI agents can predict counterpart decisions in negotiation games by combining tabular features with LLM-based text representations and hidden states from a frozen observer model, outperforming direct prompting methods.",
        "zh-CN": "结合表格特征与文本表示及冻结观察者模型隐状态，提升谈判游戏中AI代理决策预测性能",
        "ja": "AIエージェントは、固定観測者モデルからの隠れ状態とテキスト表現を組み合わせることで、交渉ゲームにおける相手の意思決定を予測し、直接プロンプティングより優れた結果を得ます。",
        "ko": "AI 에이전트는 텍스트 표현과 은닉 상태를 결합하여 협상 게임에서 상대방의 결정을 예측하며, 직접적인 프롬프팅 방법보다 우수합니다.",
        "es": "Los agentes de IA pueden predecir decisiones de contrapartida en juegos de negociación combinando características tabulares con representaciones de texto basadas en LLM y estados ocultos de un modelo observador congelado, superando métodos de invocación directa.",
        "de": "AI-Agenten können Entscheidungen von Gegenparteien in Verhandlungsspielen vorhersagen, indem sie tabellarische Merkmale mit LLM-basierten Textdarstellungen kombinieren und versteckte Zustände eines frozen Observer-Modells nutzen."
      }
    },
    {
      "arxivId": "2605.20025",
      "title": "AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration",
      "summary": "AutoResearchClaw is a multi-agent autonomous research system that improves scientific discovery through structured debate, self-healing execution, verifiable reporting, human collaboration, and evolutionary learning, outperforming previous systems on a benchmark while maintaining human oversight.",
      "authors": [
        "Jiaqi Liu",
        "Shi Qiu",
        "Mairui Li",
        "Bingzhou Li",
        "Haonian Ji",
        "Siwei Han"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T00:00:00.000Z",
      "submittedAt": "2026-05-20T00:00:00.000Z",
      "upvotes": 29,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.20025.png",
      "arxivUrl": "https://arxiv.org/abs/2605.20025",
      "pdfUrl": "https://arxiv.org/pdf/2605.20025.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.20025",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "multi-agent autonomous research",
        "structured multi-agent debate",
        "self-healing executor",
        "\\textsc{Pivot}/\\textsc{Refine} decision loop",
        "verifiable result reporting",
        "human-in-the-loop collaboration"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "AutoResearchClaw：基于人类-AI协作的自强化自主研究",
      "summary_zh": "通过结构化辩论和进化学习提升科学发现，优于现有系统并保持人类监督",
      "title_i18n": {
        "en": "AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration",
        "zh-CN": "AutoResearchClaw：基于人类-AI协作的自强化自主研究",
        "ja": "AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration",
        "ko": "AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration",
        "es": "AutoResearchClaw: Investigación Autónoma Auto-Reforzada con Colaboración Humano-I.A.",
        "de": "AutoResearchClaw: Self-Reinforcing Autonomous Research with Human-AI Collaboration"
      },
      "summary_i18n": {
        "en": "AutoResearchClaw is a multi-agent autonomous research system that improves scientific discovery through structured debate, self-healing execution, verifiable reporting, human collaboration, and evolutionary learning, outperforming previous systems on a benchmark while maintaining human oversight.",
        "zh-CN": "通过结构化辩论和进化学习提升科学发现，优于现有系统并保持人类监督",
        "ja": "AutoResearchClaw は、構造的な議論や自己修復実行を通じて科学的発見を向上させるマルチエージェントの自律研究システムであり、人間の監視を維持しながらベンチマークで以前のシステムを上回ります。",
        "ko": "AutoResearchClaw는 구조화된 논쟁과 인간 협업을 통해 과학적 발견을 개선하는 다중 에이전트 자율 연구 시스템입니다.",
        "es": "AutoResearchClaw es un sistema de investigación multiagente autónomo que mejora el descubrimiento científico mediante debate estructurado, ejecución auto-sanadora, informes verificables, colaboración humana y aprendizaje evolutivo, superando sistemas anteriores en una prueba mientras mantiene supervisión humana.",
        "de": "AutoResearchClaw ist ein Multi-Agenten-System für autonome Forschung, das wissenschaftliche Entdeckungen durch strukturierte Debatte, Selbstheilung, verifizierbare Berichte, menschliche Zusammenarbeit und evolutionäre Lernverfahren verbessert."
      }
    },
    {
      "arxivId": "2605.17672",
      "title": "Stop When Reasoning Converges: Semantic-Preserving Early Exit for Reasoning Models",
      "summary": "Researchers introduce PUMA, a framework that uses semantic redundancy detection to improve reasoning efficiency in large models by identifying when continued thinking provides no new insights, thus reducing computational waste while maintaining answer accuracy and reasoning quality.",
      "authors": [
        "Dehai Min",
        "Giovanni Vaccarino",
        "Huiyi Chen",
        "Yongliang Wu",
        "Gal Yona",
        "Lu Cheng"
      ],
      "organization": {
        "_id": "65c2bfbc82fc487034cdbe0e",
        "name": "UIChicago",
        "fullname": "University of Illinois Chicago",
        "avatar": "https://www.gravatar.com/avatar/36812d7b0099d7b6dfd3f48821be465f?d=retro&size=100"
      },
      "publishedAt": "2026-05-17T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 19,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.17672.png",
      "arxivUrl": "https://arxiv.org/abs/2605.17672",
      "pdfUrl": "https://arxiv.org/pdf/2605.17672.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.17672",
      "githubRepo": "https://github.com/giovanni-vaccarino/PUMA",
      "githubStars": 4,
      "keywords": [
        "Large Reasoning Models",
        "chains of thought",
        "early-exit methods",
        "answer-level signals",
        "reasoning convergence",
        "semantic redundancy"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "推理收敛时停止：语义保留的早期退出方法",
      "summary_zh": "提出PUMA框架，通过语义冗余检测实现推理模型高效计算，减少计算浪费并保持答案准确性和推理质量。",
      "title_i18n": {
        "en": "Stop When Reasoning Converges: Semantic-Preserving Early Exit for Reasoning Models",
        "zh-CN": "推理收敛时停止：语义保留的早期退出方法",
        "ja": "Stop When Reasoning Converges: Semantic-Preserving Early Exit for Reasoning Models",
        "ko": "Stop When Reasoning Converges: Semantic-Preserving Early Exit for Reasoning Models",
        "es": "Detenerse cuando el Razonamiento Converge: Salida Temprana que Preserva el Sentido para Modelos de Razonamiento",
        "de": "Stop When Reasoning Converges: Semantic-Preserving Early Exit for Reasoning Models"
      },
      "summary_i18n": {
        "en": "Researchers introduce PUMA, a framework that uses semantic redundancy detection to improve reasoning efficiency in large models by identifying when continued thinking provides no new insights, thus reducing computational waste while maintaining answer accuracy and reasoning quality.",
        "zh-CN": "提出PUMA框架，通过语义冗余检测实现推理模型高效计算，减少计算浪费并保持答案准确性和推理质量。",
        "ja": "PUMA は、意味の冗長性検出を使用して、継続的な思考が新たな洞察をもたらさないときに計算の無駄を減らすことで、大規模モデルの推論効率を向上させます。",
        "ko": "PUMA는 의미적 중복 감지를 통해 대규모 모델의 추론 효율성을 향상시키고, 계산 낭비를 줄이며 정답 정확도를 유지합니다.",
        "es": "Los investigadores presentan PUMA, un marco que utiliza detección de redundancia semántica para mejorar la eficiencia del razonamiento en modelos grandes al identificar cuándo el pensamiento continuo no proporciona nuevas perspectivas, reduciendo el desperdicio computacional manteniendo la precisión y calidad del razonamiento.",
        "de": "Forscher stellen PUMA vor, ein Framework, das semantische Redundanzdetektion verwendet, um die Effizienz großer Modelle bei der Schlussfolgerung zu verbessern, indem es identifiziert, wann weiteres Denken keine neuen Erkenntnisse bringt."
      }
    },
    {
      "arxivId": "2605.02290",
      "title": "Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding",
      "summary": "CoRD is a collaborative multi-teacher decoding framework that synthesizes reasoning trajectories through predictive perplexity scoring and beam search, enabling efficient distillation of large reasoning models with high-quality outputs and generalized performance.",
      "authors": [
        "Taewon Yun",
        "Jisu Shin",
        "Jeonghwan Choi",
        "Seunghwan Bang",
        "Hwanjun Song"
      ],
      "organization": {
        "_id": "6708fb8eb992dee2c3ffbaae",
        "name": "DISLab",
        "fullname": "Data Intelligence System Lab",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/63c9da8d5fdc575773c84816/YxqnL3XD4yK_dqZY3zlmr.png"
      },
      "publishedAt": "2026-05-04T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 36,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.02290.png",
      "arxivUrl": "https://arxiv.org/abs/2605.02290",
      "pdfUrl": "https://arxiv.org/pdf/2605.02290.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.02290",
      "githubRepo": "https://github.com/DISL-Lab/CoRD",
      "githubStars": 2,
      "keywords": [
        "distilling large reasoning models",
        "Long-CoT reasoning",
        "collaborative multi-teacher decoding",
        "predictive perplexity-based scoring",
        "beam search",
        "heterogeneous teachers"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "通过协作逐步多教师解码进行长CoT推理蒸馏",
      "summary_zh": "提出CoRD框架，通过预测困惑度评分和束搜索合成推理轨迹，实现高质量输出的高效大模型推理蒸馏。",
      "title_i18n": {
        "en": "Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding",
        "zh-CN": "通过协作逐步多教师解码进行长CoT推理蒸馏",
        "ja": "Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding",
        "ko": "Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding",
        "es": "Destilación del Razonamiento Long-CoT mediante Decodificación Paso a Paso con Múltiples Profesores Colaborativos",
        "de": "Distilling Long-CoT Reasoning through Collaborative Step-wise Multi-Teacher Decoding"
      },
      "summary_i18n": {
        "en": "CoRD is a collaborative multi-teacher decoding framework that synthesizes reasoning trajectories through predictive perplexity scoring and beam search, enabling efficient distillation of large reasoning models with high-quality outputs and generalized performance.",
        "zh-CN": "提出CoRD框架，通过预测困惑度评分和束搜索合成推理轨迹，实现高质量输出的高效大模型推理蒸馏。",
        "ja": "CoRD は、予測的な perplexity スコアリングとビームサーチを通じて、高品質な出力と汎化性能を持つ大規模推論モデルの効率的な蒸留を可能にする協働型マルチティーチャー推論フレームワークです。",
        "ko": "CoRD는 예측적 혼란도 점수와 빔 서치를 통해 추론 경로를 합성하여 고품질 출력과 일반화된 성능을 제공하는 협업형 다중 교사 디코딩 프레임워크입니다.",
        "es": "CoRD es un marco de decodificación colaborativa con múltiples profesores que sintetiza trayectorias de razonamiento mediante puntuación de perplejidad predictiva y búsqueda de haz, permitiendo una destilación eficiente de modelos de razonamiento grandes con salidas de alta calidad y desempeño generalizado.",
        "de": "CoRD ist ein kooperatives Multi-Teacher-Decoding-Framework, das Schlussfolgerungspfade durch prädiktive Perplexitätsbewertung und Beam Search synthetisiert, um große Schlussfolgerungsmodelle effizient zu destillieren."
      }
    },
    {
      "arxivId": "2605.15565",
      "title": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs",
      "summary": "AstraFlow is a dataflow-oriented reinforcement learning system that enables efficient multi-policy collaborative training and elastic scaling across diverse compute resources for large language model agents.",
      "authors": [
        "Haizhong Zheng",
        "Yizhuo Di",
        "Jiahui Wang",
        "Shuowei Jin",
        "Xueshen Liu",
        "Yongji Wu"
      ],
      "organization": {
        "_id": "654944a99b86bd6b2c577ba7",
        "name": "cmu-llm",
        "fullname": "Carnegie Mellon University",
        "avatar": "https://www.gravatar.com/avatar/51ec037487364270991176aaf5b58145?d=retro&size=100"
      },
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-19T00:00:00.000Z",
      "upvotes": 12,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15565.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15565",
      "pdfUrl": "https://arxiv.org/pdf/2605.15565.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15565",
      "githubRepo": "https://github.com/Infini-AI-Lab/astraflow",
      "githubStars": 25,
      "keywords": [
        "reinforcement learning",
        "large language models",
        "agentic RL",
        "multi-policy collaborative training",
        "elastic scaling",
        "heterogeneous compute resources"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "AstraFlow：面向数据流的强化学习系统",
      "summary_zh": "AstraFlow实现多策略协作训练和弹性扩展，提升大语言模型代理效率",
      "title_i18n": {
        "en": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs",
        "zh-CN": "AstraFlow：面向数据流的强化学习系统",
        "ja": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs",
        "ko": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs",
        "es": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs",
        "de": "AstraFlow: Dataflow-Oriented Reinforcement Learning for Agentic LLMs"
      },
      "summary_i18n": {
        "en": "AstraFlow is a dataflow-oriented reinforcement learning system that enables efficient multi-policy collaborative training and elastic scaling across diverse compute resources for large language model agents.",
        "zh-CN": "AstraFlow实现多策略协作训练和弹性扩展，提升大语言模型代理效率",
        "ja": "AstraFlowは、大規模言語モデルエージェントのための効率的なマルチポリシー協働トレーニングと弾性スケーリングを可能にするデータフロー指向の強化学習システムです。",
        "ko": "AstraFlow는 대규모 언어 모델 에이전트를 위한 효율적인 다중 정책 협업 학습과 다양한 컴퓨팅 자원에서의 유연한 확장을 가능하게 하는 데이터 흐름 중심 강화 학습 시스템입니다.",
        "es": "AstraFlow es un sistema de aprendizaje por refuerzo orientado a flujos de datos que permite un entrenamiento colaborativo eficiente con múltiples políticas y escalabilidad elástica.",
        "de": "AstraFlow ist ein datenflussorientiertes Verstärkungslernsystem für agentische LLMs, das effizientes kollaboratives Multi-Policy-Training und elastische Skalierung über verschiedene Rechenressourcen ermöglicht."
      }
    },
    {
      "arxivId": "2605.14445",
      "title": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale",
      "summary": "FrontierSmith automates the creation of open-ended coding problems from closed-ended tasks, improving LLM coding performance on benchmarks through diverse problem variants and enhanced agent interactions.",
      "authors": [
        "Runyuan He",
        "Qiuyang Mang",
        "Shang Zhou",
        "Kaiyuan Liu",
        "Hanchen Li",
        "Huanzhi Mao"
      ],
      "organization": null,
      "publishedAt": "2026-05-14T00:00:00.000Z",
      "submittedAt": "2026-05-15T00:00:00.000Z",
      "upvotes": 19,
      "comments": 2,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.14445.png",
      "arxivUrl": "https://arxiv.org/abs/2605.14445",
      "pdfUrl": "https://arxiv.org/pdf/2605.14445.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.14445",
      "githubRepo": "https://github.com/FrontierCS/FrontierSmith",
      "githubStars": 30,
      "keywords": [
        "LLM coding",
        "open-ended coding",
        "competitive programming",
        "automated problem generation",
        "idea divergence metric",
        "test case generation"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "FrontierSmith：大规模合成开放式编码问题",
      "summary_zh": "通过多样化问题变体和增强代理交互提升LLM编码性能",
      "title_i18n": {
        "en": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale",
        "zh-CN": "FrontierSmith：大规模合成开放式编码问题",
        "ja": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale",
        "ko": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale",
        "es": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale",
        "de": "FrontierSmith: Synthesizing Open-Ended Coding Problems at Scale"
      },
      "summary_i18n": {
        "en": "FrontierSmith automates the creation of open-ended coding problems from closed-ended tasks, improving LLM coding performance on benchmarks through diverse problem variants and enhanced agent interactions.",
        "zh-CN": "通过多样化问题变体和增强代理交互提升LLM编码性能",
        "ja": "FrontierSmithは閉じたタスクからオープンエンドなコーディング問題を自動生成し、多様な問題変種とエージェント間の相互作用によりLLMのコード性能を向上させます。",
        "ko": "FrontierSmith는 닫힌 형태 작업에서 열린 형태 코딩 문제를 자동으로 생성하여 다양한 문제 변형과 개선된 에이전트 상호작용을 통해 LLM 코딩 성능을 향상시킵니다.",
        "es": "FrontierSmith automatiza la creación de problemas de programación abiertos a partir de tareas cerradas, mejorando el rendimiento de LLM en benchmarks.",
        "de": "FrontierSmith automatisiert die Erstellung offener Programmieraufgaben aus geschlossenen Aufgaben, um die LLM-Programmierleistung durch vielfältige Problemvorgaben zu verbessern."
      }
    },
    {
      "arxivId": "2605.15726",
      "title": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR",
      "summary": "NudgeRL framework enhances reinforcement learning with verifiable rewards through structured exploration and strategy nudging to improve reasoning capabilities in large language models.",
      "authors": [
        "Chanuk Lee",
        "Sangwoo Park",
        "Minki Kang",
        "Sung Ju Hwang"
      ],
      "organization": {
        "_id": "6475760c33192631bad2bb38",
        "name": "kaist-ai",
        "fullname": "KAIST AI",
        "avatar": "https://cdn-avatars.huggingface.co/v1/production/uploads/6469949654873f0043b09c22/aaZFiyXe1qR-Dmy_xq67m.png"
      },
      "publishedAt": "2026-05-15T00:00:00.000Z",
      "submittedAt": "2026-05-18T00:00:00.000Z",
      "upvotes": 29,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.15726.png",
      "arxivUrl": "https://arxiv.org/abs/2605.15726",
      "pdfUrl": "https://arxiv.org/pdf/2605.15726.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.15726",
      "githubRepo": "https://github.com/tally0818/NudgeRL",
      "githubStars": 5,
      "keywords": [
        "reinforcement learning with verifiable rewards",
        "policy improvement",
        "exploration",
        "rollouts",
        "strategy-level contexts",
        "reward signal decomposition"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "超越舒适区的引导：RLVR的高效策略引导探索",
      "summary_zh": "提出NudgeRL框架，通过结构化探索和策略引导提升大语言模型的推理能力。",
      "title_i18n": {
        "en": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR",
        "zh-CN": "超越舒适区的引导：RLVR的高效策略引导探索",
        "ja": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR",
        "ko": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR",
        "es": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR",
        "de": "Nudging Beyond the Comfort Zone: Efficient Strategy-Guided Exploration for RLVR"
      },
      "summary_i18n": {
        "en": "NudgeRL framework enhances reinforcement learning with verifiable rewards through structured exploration and strategy nudging to improve reasoning capabilities in large language models.",
        "zh-CN": "提出NudgeRL框架，通过结构化探索和策略引导提升大语言模型的推理能力。",
        "ja": "NudgeRLフレームワークは構造化された探索と戦略の誘導を通じて検証可能な報酬で強化学習を強化し、大規模言語モデルの推論能力を向上させます。",
        "ko": "NudgeRL 프레임워크는 구조화된 탐색과 전략 안내를 통해 검증 가능한 보상을 통한 강화 학습을 향상시켜 대규모 언어 모델의 추론 능력을 개선합니다.",
        "es": "El marco NudgeRL mejora el aprendizaje por refuerzo con recompensas verificables mediante exploración estructurada y guía estratégica.",
        "de": "Das NudgeRL-Framework verbessert das Verstärkungslernen mit verifizierbaren Belohnungen durch strukturierte Exploration und Strategie-Nudging, um die Schlussfolgerungsfähigkeiten großer Sprachmodelle zu verbessern."
      }
    },
    {
      "arxivId": "2605.19995",
      "title": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition",
      "summary": "Diffusion models applied in compressed image space generate high-quality images with lower computational cost and support flexible inputs like text or boxes.",
      "authors": [
        "Hongji Yang",
        "Songlian Li",
        "Yucheng Zhou",
        "Xiaotong Zhao",
        "Alan Zhao",
        "Chengzhong Xu"
      ],
      "organization": null,
      "publishedAt": "2026-05-19T00:00:00.000Z",
      "submittedAt": "2026-05-20T00:00:00.000Z",
      "upvotes": 27,
      "comments": 1,
      "thumbnail": "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/2605.19995.png",
      "arxivUrl": "https://arxiv.org/abs/2605.19995",
      "pdfUrl": "https://arxiv.org/pdf/2605.19995.pdf",
      "hfUrl": "https://huggingface.co/papers/2605.19995",
      "githubRepo": null,
      "githubStars": null,
      "keywords": [
        "diffusion models",
        "video generation",
        "creative intent cognition",
        "CogVLM",
        "in-context generation",
        "reinforcement learning"
      ],
      "source": "HF Daily Papers",
      "sourceType": "hf",
      "title_zh": "CogOmniControl：通过创意意图认知的推理驱动可控视频生成",
      "summary_zh": "基于GPT和LoRA实现视频生成的推理控制，提升生成内容的可控性与创造性。",
      "title_i18n": {
        "en": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition",
        "zh-CN": "CogOmniControl：通过创意意图认知的推理驱动可控视频生成",
        "ja": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition",
        "ko": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition",
        "es": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition",
        "de": "CogOmniControl: Reasoning-Driven Controllable Video Generation via Creative Intent Cognition"
      },
      "summary_i18n": {
        "en": "Diffusion models applied in compressed image space generate high-quality images with lower computational cost and support flexible inputs like text or boxes.",
        "zh-CN": "基于GPT和LoRA实现视频生成的推理控制，提升生成内容的可控性与创造性。",
        "ja": "圧縮画像空間で適用された拡散モデルは、テキストやボックスなどの柔軟な入力に対応し、低い計算コストで高品質な画像を生成します。",
        "ko": "압축된 이미지 공간에서 적용된 디퓨전 모델은 더 낮은 계산 비용으로 고품질 이미지를 생성하고 텍스트나 박스와 같은 유연한 입력을 지원합니다.",
        "es": "Los modelos de difusión aplicados en espacio de imagen comprimido generan imágenes de alta calidad con menor costo computacional.",
        "de": "Diffusionsmodelle im komprimierten Bildraum generieren hochwertige Bilder mit geringerem Rechenaufwand und unterstützen flexible Eingaben wie Text oder Boxen."
      }
    }
  ],
  "i18nUpdatedAt": "2026-05-20T06:13:03.993Z",
  "locales": [
    "en",
    "zh-CN",
    "ja",
    "ko",
    "es",
    "de"
  ]
}