Files
build_tools/scripts/sdkjs_common/jsdoc/office-api/generate_jsonl_dataset.py
Nikita Khromov 099003b250 [jsdoc] Fix paths
2025-10-15 18:11:05 +07:00

250 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import json
import re
import shutil
import argparse
import generate_docs_json
from datetime import datetime
# Configuration files
editors = [
"word",
"cell",
"slide",
"forms"
]
editors_names = {
"word": "Word",
"cell": "Spreadsheet",
"slide": "Presentation",
"forms": "Forms"
}
script_path = os.path.abspath(__file__)
root = os.path.abspath(os.path.join(os.path.dirname(script_path), '../../../../..'))
missing_examples = []
def load_json(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
def read_file_content(file_path):
try:
with open(file_path, encoding='utf-8') as f:
return f.read()
except Exception as e:
missing_examples.append(file_path)
# print(f"Failed to open file {file_path}: {e}")
return ""
def extract_js_comments_as_text(text):
# Extract single-line comments (after //)
single_line_comments = re.findall(r'^\s*//(.*)$', text, flags=re.MULTILINE)
# Extract multi-line comments (between /* and */)
multi_line_comments = re.findall(r'/\*(.*?)\*/', text, flags=re.DOTALL)
# Combine all found comments into a single list
all_comments = single_line_comments + multi_line_comments
# Join comments into a single text, separated by a space
return " ".join(comment.strip() for comment in all_comments if comment.strip())
def extract_examples_blocks(content: str):
blocks = []
current_block = {"comments": [], "code": []}
in_comment_section = True # Collect comments until code appears
current_comment_group = [] # Accumulate lines of the current comment
for line in content.splitlines():
stripped = line.strip()
if not stripped:
# Empty line
if in_comment_section and current_comment_group:
# Finish the current comment group
comment_text = " ".join(current_comment_group)
current_block["comments"].append(comment_text)
current_comment_group = []
elif not in_comment_section:
# Empty line in the code keep it as is
current_block["code"].append(line)
continue
if stripped.startswith("//"):
if in_comment_section:
# Remove comment marker and extra spaces
current_comment_group.append(extract_js_comments_as_text(stripped))
else:
# Comment after code starts finish the current block and start a new one
blocks.append({
"comments": current_block["comments"],
"code": "\n".join(current_block["code"]).rstrip()
})
current_block = {"comments": [], "code": []}
in_comment_section = True
# Start a new comment group with the current line
current_comment_group = [stripped[2:].strip()]
else:
# Code line
if in_comment_section:
if current_comment_group:
comment_text = " ".join(current_comment_group)
current_block["comments"].append(comment_text)
current_comment_group = []
in_comment_section = False
current_block["code"].append(line)
# Finalize any remaining comment group
if in_comment_section and current_comment_group:
comment_text = " ".join(current_comment_group)
current_block["comments"].append(comment_text)
# Save the last block if it's not empty
if current_block["comments"] or current_block["code"]:
blocks.append({
"comments": current_block["comments"],
"code": "\n".join(current_block["code"]).rstrip()
})
return blocks
def extract_examples_blocks_temp(content: str):
lines = content.splitlines()
comment_blocks = []
current_group = []
first_code_index = None
for i, line in enumerate(lines):
stripped = line.strip()
if not stripped:
if current_group:
comment_blocks.append(" ".join(current_group))
current_group = []
continue
if stripped.startswith("//"):
current_group.append(stripped[2:].strip())
else:
if current_group:
comment_blocks.append(" ".join(current_group))
current_group = []
first_code_index = i
break
code_part = ""
if first_code_index is not None:
code_part = "\n".join(lines[first_code_index:]).rstrip()
return [{"comments": comment_blocks, "code": code_part}]
def create_entry(system_message, user_message, assistant_message, model):
entry = {
"created_at": datetime.now().isoformat(" "),
"messages": [
{"role": "system", "content": system_message},
{"role": "user", "content": user_message},
{"role": "assistant", "content": assistant_message}
],
"recommended": False,
"upvoted": True
}
if model is not "":
entry["model"] = model
return entry
def process_doclets(doclets, output_entries, editor_name, model):
system_message = f'You are an expert in API for library from OnlyOffice. The library provides functions for editing {editors_names[editor_name]} documents. Your functional capabilities: 1) Explanation of Onlyoffice JavaScript API classes and their methods and parameters. 2) Assistance in writing Onlyoffice JavaScript API examples upon user request. 3) Reviewing user examples, assisting in finding and fixing their mistakes.'
for doclet in doclets:
kind = doclet.get("kind", "").lower()
see = doclet.get("see", [])
# The "see" field must always be present
if not see:
continue
# Processing based on the "kind" value
if kind == "function":
method_name = doclet.get("name", "")
memberof = doclet.get("memberof", "")
# Functions must have both "name" (method_name) and "memberof" fields filled
if not (method_name and memberof):
continue
default_user_message = f"How do I use the method {method_name} of {memberof} class?"
elif kind == "class":
class_name = doclet.get("name", "")
default_user_message = f"How do I instantiate or work with the class {class_name}?"
elif kind == "typedef":
typedef_name = doclet.get("name", "")
default_user_message = f"How do I use the typedef {typedef_name}?"
else:
continue
# Read the content of the first file listed in the "see" field
content = read_file_content(f'{root}/{see[0]}')
if content == "":
continue
# now use only first block cause there is bad comments in examples
blocks = extract_examples_blocks_temp(content)
for block in blocks:
assistant_message = block['code']
# default entry
output_entries.append(create_entry(system_message, default_user_message, assistant_message, model))
# If the file content contains comments, create a separate entry for each one
for comment in block['comments']:
output_entries.append(create_entry(system_message, comment, assistant_message, model))
def generate(output_dir, model):
os.chdir(os.path.dirname(script_path))
print('Generating documentation JSONL dataset...')
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir)
generate_docs_json.generate(f'{output_dir}/tmp_json')
output_entries = []
output_filename = "dataset.jsonl"
for editor_name in editors:
input_file = os.path.join(f'{output_dir}/tmp_json', editor_name + ".json")
doclets = load_json(input_file)
process_doclets(doclets, output_entries, editor_name, model)
with open(f'{output_dir}/{output_filename}', "w", encoding="utf-8") as out_file:
for entry in output_entries:
out_file.write(json.dumps(entry, ensure_ascii=False) + "\n")
shutil.rmtree(f'{output_dir}/tmp_json')
print('Done')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate documentation JSONL dataset")
parser.add_argument(
"destination",
type=str,
help="Destination directory for the generated documentation",
nargs='?', # Indicates the argument is optional
default=f"{root}/office-js-api/dataset" # Default value
)
parser.add_argument(
"model",
type=str,
help="Type of model",
nargs='?', # Indicates the argument is optional
default="" # Default value
)
args = parser.parse_args()
generate(args.destination, args.model)
print("START_MISSING_EXAMPLES")
print(",".join(missing_examples))
print("END_MISSING_EXAMPLES")