Feat: include author, journal name, volume, issue, page, and DOI in PubMed search results (#10481)

### What problem does this PR solve?

issue:
[#6571](https://github.com/infiniflow/ragflow/issues/6571)
change:
include author, journal name, volume, issue, page, and DOI in PubMed
search results

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
buua436
2025-10-11 16:00:16 +08:00
committed by GitHub
parent 52f26f4643
commit a0d5f81098

View File

@ -85,13 +85,7 @@ class PubMed(ToolBase, ABC):
self._retrieve_chunks(pubmedcnt.findall("PubmedArticle"), self._retrieve_chunks(pubmedcnt.findall("PubmedArticle"),
get_title=lambda child: child.find("MedlineCitation").find("Article").find("ArticleTitle").text, get_title=lambda child: child.find("MedlineCitation").find("Article").find("ArticleTitle").text,
get_url=lambda child: "https://pubmed.ncbi.nlm.nih.gov/" + child.find("MedlineCitation").find("PMID").text, get_url=lambda child: "https://pubmed.ncbi.nlm.nih.gov/" + child.find("MedlineCitation").find("PMID").text,
get_content=lambda child: child.find("MedlineCitation") \ get_content=lambda child: self._format_pubmed_content(child),)
.find("Article") \
.find("Abstract") \
.find("AbstractText").text \
if child.find("MedlineCitation")\
.find("Article").find("Abstract") \
else "No abstract available")
return self.output("formalized_content") return self.output("formalized_content")
except Exception as e: except Exception as e:
last_e = e last_e = e
@ -104,5 +98,50 @@ class PubMed(ToolBase, ABC):
assert False, self.output() assert False, self.output()
def _format_pubmed_content(self, child):
"""Extract structured reference info from PubMed XML"""
def safe_find(path):
node = child
for p in path.split("/"):
if node is None:
return None
node = node.find(p)
return node.text if node is not None and node.text else None
title = safe_find("MedlineCitation/Article/ArticleTitle") or "No title"
abstract = safe_find("MedlineCitation/Article/Abstract/AbstractText") or "No abstract available"
journal = safe_find("MedlineCitation/Article/Journal/Title") or "Unknown Journal"
volume = safe_find("MedlineCitation/Article/Journal/JournalIssue/Volume") or "-"
issue = safe_find("MedlineCitation/Article/Journal/JournalIssue/Issue") or "-"
pages = safe_find("MedlineCitation/Article/Pagination/MedlinePgn") or "-"
# Authors
authors = []
for author in child.findall(".//AuthorList/Author"):
lastname = safe_find("LastName") or ""
forename = safe_find("ForeName") or ""
fullname = f"{forename} {lastname}".strip()
if fullname:
authors.append(fullname)
authors_str = ", ".join(authors) if authors else "Unknown Authors"
# DOI
doi = None
for eid in child.findall(".//ArticleId"):
if eid.attrib.get("IdType") == "doi":
doi = eid.text
break
return (
f"Title: {title}\n"
f"Authors: {authors_str}\n"
f"Journal: {journal}\n"
f"Volume: {volume}\n"
f"Issue: {issue}\n"
f"Pages: {pages}\n"
f"DOI: {doi or '-'}\n"
f"Abstract: {abstract.strip()}"
)
def thoughts(self) -> str: def thoughts(self) -> str:
return "Looking for scholarly papers on `{}`,” prioritising reputable sources.".format(self.get_input().get("query", "-_-!")) return "Looking for scholarly papers on `{}`,” prioritising reputable sources.".format(self.get_input().get("query", "-_-!"))