Skip to content

Commit fcff825

Browse files
authored
fix: literature tool improvements (Round 79) (#121)
* fix: literature tool improvements (Round 79) Feature-79A-001: BaseRESTTool now applies JSON schema defaults to query params, fixing Semantic Scholar tools returning only paperId+title instead of full data Feature-79A-002: PubMed_get_article returns structured JSON (title, abstract, authors, MeSH terms, DOI) instead of raw XML Feature-79A-003: PubMed_get_related enriches results with article metadata (title, authors, journal) instead of returning bare PMID+score pairs Feature-79B-004: OpenCitations_get_citations now has client-side limit (default 100) to prevent 14K+ citation responses overflowing output buffers Feature-79B-008: MultiAgentLiteratureSearch _parse_result now unwraps nested JSON from agent results, fixing the broken pipeline * refactor: compress Round 79 code per code-simplifier principles - Flatten nested conditionals in client_side_limit and elink enrichment - Remove redundant variables and intermediate assignments - Consolidate duplicate return paths in elink enrichment - Simplify _parse_result control flow * fix: remove unused variable to pass ruff lint check
1 parent 0ec68f8 commit fcff825

File tree

5 files changed

+618
-22
lines changed

5 files changed

+618
-22
lines changed

src/tooluniverse/base_rest_tool.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,34 @@ def _build_params(self, args: Dict[str, Any]) -> Dict[str, Any]:
8888
# Get param mapping for this API
8989
param_mapping = self._get_param_mapping()
9090

91-
# Only add arguments that aren't path parameters
91+
# Params handled client-side only (not sent to API)
92+
client_only = (
93+
{"limit"}
94+
if self.tool_config.get("fields", {}).get("client_side_limit")
95+
else set()
96+
)
97+
9298
for key, value in args.items():
93-
if f"{{{key}}}" not in url_template and value is not None:
94-
# Use mapped parameter name if available
95-
param_name = param_mapping.get(key, key)
96-
params[param_name] = value
99+
if (
100+
key not in client_only
101+
and f"{{{key}}}" not in url_template
102+
and value is not None
103+
):
104+
params[param_mapping.get(key, key)] = value
105+
106+
# Apply schema defaults for optional params not provided by the caller
107+
for key, prop in (
108+
self.tool_config.get("parameter", {}).get("properties", {}).items()
109+
):
110+
if (
111+
key in client_only
112+
or key in params
113+
or key in args
114+
or f"{{{key}}}" in url_template
115+
):
116+
continue
117+
if "default" in prop and prop["default"] is not None:
118+
params[param_mapping.get(key, key)] = prop["default"]
97119

98120
return params
99121

@@ -199,7 +221,23 @@ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
199221
return special_result
200222

201223
# Use default response processing
202-
return self._process_response(response, url)
224+
result = self._process_response(response, url)
225+
226+
# Client-side limit for APIs that return unbounded lists
227+
if self.tool_config.get("fields", {}).get("client_side_limit"):
228+
props = self.tool_config.get("parameter", {}).get("properties", {})
229+
limit = arguments.get("limit", props.get("limit", {}).get("default"))
230+
data = result.get("data")
231+
if (
232+
limit is not None
233+
and isinstance(data, list)
234+
and len(data) > int(limit)
235+
):
236+
result["total_before_limit"] = len(data)
237+
result["data"] = data[: int(limit)]
238+
result["count"] = int(limit)
239+
240+
return result
203241

204242
except Exception as e:
205243
return {

src/tooluniverse/compose_scripts/enhanced_multi_agent_literature_search.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -266,21 +266,25 @@ def compose(
266266

267267

268268
def _parse_result(result):
269-
"""解析工具结果,确保返回字典格式"""
269+
"""Parse tool result into a dict, unwrapping nested JSON strings."""
270270
if isinstance(result, str):
271271
try:
272272
parsed = json.loads(result)
273-
if isinstance(parsed, dict):
274-
return parsed
275-
else:
276-
return {"result": parsed}
273+
return parsed if isinstance(parsed, dict) else {"result": parsed}
277274
except Exception:
278275
return {"result": result}
279-
elif isinstance(result, dict):
280-
# 如果已经是字典,直接返回
276+
if isinstance(result, dict):
277+
# Unwrap nested JSON in "result" key (e.g. from agent tools)
278+
inner = result.get("result")
279+
if isinstance(inner, str):
280+
try:
281+
parsed = json.loads(inner)
282+
if isinstance(parsed, dict):
283+
return parsed
284+
except Exception:
285+
pass
281286
return result
282-
else:
283-
return {"result": str(result)}
287+
return {"result": str(result)}
284288

285289

286290
def _format_papers_for_summary(papers):

src/tooluniverse/data/opencitations_tools.json

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,14 +91,20 @@
9191
"doi": {
9292
"type": "string",
9393
"description": "DOI of the paper to find citations for. Do not include 'https://doi.org/' prefix. Examples: '10.1038/nature12373', '10.1002/jcc.21224'"
94+
},
95+
"limit": {
96+
"type": ["integer", "null"],
97+
"description": "Maximum number of citations to return. Highly-cited papers can have 10,000+ citations; use this to avoid oversized responses.",
98+
"default": 100
9499
}
95100
},
96101
"required": [
97102
"doi"
98103
]
99104
},
100105
"fields": {
101-
"endpoint": "https://api.opencitations.net/index/v1/citations/{doi}"
106+
"endpoint": "https://api.opencitations.net/index/v1/citations/{doi}",
107+
"client_side_limit": true
102108
},
103109
"type": "BaseRESTTool",
104110
"test_examples": [

src/tooluniverse/pubmed_tool.py

Lines changed: 122 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,102 @@ def parse_article(pmid: str, article_data: Dict[str, Any]) -> Dict[str, Any]:
287287
"error": f"Failed to fetch article summaries: {str(e)}",
288288
}
289289

290+
def _parse_efetch_xml(self, response) -> Dict[str, Any]:
291+
"""Parse PubMed efetch XML into structured article data."""
292+
try:
293+
root = ET.fromstring(response.text)
294+
except ET.ParseError:
295+
return {"status": "success", "data": response.text, "url": response.url}
296+
297+
def _text(el, path, default=""):
298+
found = el.find(path) if el is not None else None
299+
return found.text if found is not None and found.text else default
300+
301+
def _itertext(el, path):
302+
found = el.find(path) if el is not None else None
303+
return "".join(found.itertext()) if found is not None else ""
304+
305+
def _parse_article(article_el):
306+
cit = article_el.find("MedlineCitation")
307+
art = cit.find("Article") if cit is not None else None
308+
if cit is None or art is None:
309+
return None
310+
311+
pmid = _text(cit, "PMID")
312+
title = _itertext(art, "ArticleTitle")
313+
314+
# Abstract: join labeled sections
315+
abstract_parts = []
316+
for at in art.findall("Abstract/AbstractText") or []:
317+
label, text = at.get("Label", ""), "".join(at.itertext()).strip()
318+
if text:
319+
abstract_parts.append(f"{label}: {text}" if label else text)
320+
abstract = " ".join(abstract_parts)
321+
322+
# Authors (first 10)
323+
authors = []
324+
for au in (art.findall("AuthorList/Author") or [])[:10]:
325+
last, fore = au.findtext("LastName", ""), au.findtext("ForeName", "")
326+
name = f"{last} {fore}".strip() if last else fore
327+
if not name:
328+
continue
329+
entry = {"name": name}
330+
aff = _text(au, ".//Affiliation")
331+
if aff:
332+
entry["affiliation"] = aff
333+
authors.append(entry)
334+
335+
journal_el = art.find("Journal")
336+
journal = _text(journal_el, "Title") or _text(journal_el, "ISOAbbreviation")
337+
338+
doi = next(
339+
(
340+
eid.text
341+
for eid in art.findall("ELocationID")
342+
if eid.get("EIdType") == "doi" and eid.text
343+
),
344+
"",
345+
)
346+
347+
pd = art.find(".//PubDate")
348+
pub_year = _text(pd, "Year")
349+
pub_date = " ".join(
350+
filter(None, [pub_year, _text(pd, "Month"), _text(pd, "Day")])
351+
)
352+
353+
mesh = [
354+
d.text
355+
for d in cit.findall("MeshHeadingList/MeshHeading/DescriptorName")
356+
if d.text
357+
]
358+
pub_types = [pt.text for pt in art.findall(".//PublicationType") if pt.text]
359+
360+
return {
361+
"pmid": pmid,
362+
"title": title,
363+
"abstract": abstract or None,
364+
"authors": authors,
365+
"journal": journal or None,
366+
"pub_date": pub_date,
367+
"pub_year": pub_year,
368+
"doi": doi or None,
369+
"doi_url": f"https://doi.org/{doi}" if doi else None,
370+
"url": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
371+
"mesh_terms": mesh or None,
372+
"publication_types": pub_types or None,
373+
}
374+
375+
articles = [
376+
a
377+
for a in (_parse_article(el) for el in root.findall(".//PubmedArticle"))
378+
if a
379+
]
380+
data = articles[0] if len(articles) == 1 else articles
381+
result = {"status": "success", "data": data, "url": response.url}
382+
if len(articles) != 1:
383+
result["count"] = len(articles)
384+
return result
385+
290386
def _fetch_abstracts(self, pmid_list: list[str]) -> Dict[str, str]:
291387
"""Best-effort abstract fetch via efetch XML for a list of PMIDs."""
292388
pmids = [str(p).strip() for p in (pmid_list or []) if str(p).strip()]
@@ -503,6 +599,30 @@ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
503599
limit = None
504600
if limit is not None:
505601
links = links[:limit]
602+
603+
# Enrich with article metadata
604+
pmids = [
605+
str(lk["id"] if isinstance(lk, dict) else lk)
606+
for lk in links
607+
]
608+
scores = {
609+
str(lk["id"]): lk.get("score")
610+
for lk in links
611+
if isinstance(lk, dict)
612+
}
613+
summary = self._fetch_summaries(pmids)
614+
if summary.get("status") == "success" and summary.get(
615+
"data"
616+
):
617+
for item in summary["data"]:
618+
score = (
619+
scores.get(str(item.get("pmid", "")))
620+
if isinstance(item, dict)
621+
else None
622+
)
623+
if score is not None:
624+
item["relevance_score"] = score
625+
links = summary["data"]
506626
return {
507627
"status": "success",
508628
"data": links,
@@ -531,12 +651,8 @@ def run(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
531651
"url": response.url,
532652
}
533653
except Exception:
534-
# For XML responses (efetch), return as text
535-
return {
536-
"status": "success",
537-
"data": response.text,
538-
"url": response.url,
539-
}
654+
# For XML responses (efetch), parse into structured data
655+
return self._parse_efetch_xml(response)
540656

541657
except Exception as e:
542658
return {

0 commit comments

Comments (0)