diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/.gitignore" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/.gitignore"
new file mode 100644
index 0000000000000000000000000000000000000000..dc683abc101a122659930791875ec01a828c0a98
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/.gitignore"
@@ -0,0 +1,20 @@
+# Python
+**/__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+env/
+venv/
+**/*_venv/
+
+# IDE
+.vscode/
+.vscode-server/
+.cursor/
+.cursor-server/
+.idea/
+
+# special
+**/results/
+**/tmp/
\ No newline at end of file
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/README.md" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/README.md"
new file mode 100644
index 0000000000000000000000000000000000000000..428df6008b990878642f8a93d7b68698c1d34a7b
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/README.md"
@@ -0,0 +1,316 @@
+# Acceptance Report
+## 1. Project Overview
+### 1.1 Background
+The main challenges OpenCloudOS 9 faces while building out its AI ecosystem:
+- The AI software stack is huge, and verifying every package by hand is slow and labor-intensive
+- Some upstream projects target only Ubuntu and other distributions and do not work on OpenCloudOS 9
+- There is no standardized verification process or automation tooling
+
+### 1.2 Goals
+Build an intelligent, automated verification tool for AI software that can:
+- Analyze a package's documentation automatically to determine how to install it
+- Generate test cases automatically
+- Verify the compatibility of Python packages in bulk
+- Standardize the test framework and output format
+
+## 2. Approach
+Based on a literature survey and on the PyPI package list provided by the task committee, we found that installation and test failures fall into the following categories:
+- Dependency conflicts: different packages may require different versions of the same library, and some packages simply cannot coexist in one environment.
+- Missing system environment: some packages depend on the system environment. For example, GPU-dependent software needs the corresponding drivers and runtimes such as `OpenGL`, `OpenCL`, or `CUDA`; if these are missing, the package errors out during verification and the verification fails.
+- Package-name mismatch: the name used with pip may differ from the name used at import time, so the import fails. For example, `opencv-python` is installed with `pip install opencv-python` but imported with `import cv2`.
+- Long-unmaintained software: for example, `gnes` has not been maintained for three years and still uses the legacy `setup.py bdist_wheel` flow and outdated Cython syntax, so it can no longer be built and installed.
+- Broken verification code: for example, the LLM generated `import mmdnn; print(mmdnn.__version__)` as the verification code for the `mmdnn` package, but `mmdnn` has no `__version__` attribute, so the test errors out.
+
+To address these problems, combined with today's popular LLMs and MCP tooling, the project settled on the following approach:
+- Verification has to be automated, yet every package is verified differently and no universal recipe exists, so we lean on MCP tooling and let it generate the install commands and verification code automatically. Some packages are far too complex for detailed functional testing (`pandas` alone declares as many as 60 optional extras), so the generated verification code covers only core functionality: an import test, a basic functionality test, and a GPU usage test.
+- For dependency conflicts, first use MCP tools to analyze inter-package dependencies and derive a topological order, then install packages in that order; additionally, when an installation fails because of a dependency problem, create a fresh virtual environment and reinstall the package there.
+- For missing system environments, we combine two strategies: `"check before verification"` and `"resolve on error"`.
+    + For the `"check before verification"` strategy we first need to understand how a Python package is installed. When a package is built and published to PyPI, its shared libraries and source code are bundled into a `wheel`; after download, the `wheel` is verified, unpacked, and copied into the `site-packages` directory; the `pip` manager then generates a `<package>-<version>.dist-info` directory from these files to manage the metadata. The file worth noting there is `top_level.txt`, because it records the package's top-level module names (the names used at `import` time), which can be used to solve the name-mismatch problem. Finally, entry points are registered, dependencies are checked, and the file list is recorded.
+    + By checking whether a downloaded package's shared objects link correctly against the system environment, we can detect and resolve a missing environment ahead of time. After downloading a package, we look up its shared libraries under `site-packages` and use the `ldd` command to see whether each one resolves against the dependency libraries in the system environment; any missing dependency is then analyzed and fixed with the help of the LLM.
+    + The `"resolve on error"` strategy splits into install-time errors and test-time errors: on an error, we collect the system environment information, the command that was used, and the error output, pass all of it to the MCP tooling, and let it analyze and resolve the problem automatically.
+- For package-name mismatches, combine `top_level.txt` analysis with LLM analysis to find the correct import name.
+- For broken verification code, the same `"resolve on error"` path applies: give the LLM the verification code and the error output, and it rewrites and re-verifies the code until it succeeds.
+
+## 3. System Design
+### 3.1 Execution Flow
+![System execution flow](assets/AI软件自动验证工具.png)
+- The AI agent and MCP tooling are optional; enabling them makes the system install and verify packages more intelligently.
+- The components are described in the sections below.
+
+### 3.2 Collecting PyPI Packages from GitHub Repositories
+#### 3.2.1 Collecting repository information from GitHub
+- Filter GitHub repositories by a user-supplied topic and record the matching repositories' URLs for later analysis.
+- To ensure the collected repositories are influential, only repositories with more than 1000 stars are kept.
+
+#### 3.2.2 Extracting PyPI packages
+- Check the repository's primary language; if it is Python, scan its `README.md` for `pip install` commands, and record any PyPI package names found there.
+
+### 3.3 Package Analysis
+Use MCP tools to complete the following tasks automatically:
+- Check whether the package name exists on PyPI; if it does, continue with the analysis below.
+- Publishers upload dependency information and environment requirements together with the package, so we fetch the dependency list (including version constraints) from PyPI to support later analysis. This metadata is not always accurate: the package `accountant-0.0.6` declares a dependency `enum>=1.1.5`, yet the newest `enum` release on PyPI is 0.4.7, so the declared information cannot be right. We therefore gather more sources to get as close to the truth as possible: publishers usually include the project's GitHub URL in the PyPI metadata, so we also read `README.md`, `requirements.txt`, and similar files from GitHub, and we use the LLM's own knowledge to further refine the dependency list. Combining these sources yields a reasonably accurate dependency list.
+- Some AI packages need a GPU to run, so we likewise check the PyPI metadata and the GitHub repository for GPU requirements and let the LLM confirm whether the package depends on a GPU.
+- The project must verify that a package actually runs, so each package gets three tests: an import test, a basic functionality test, and a GPU usage test. There is no universal template for this test code, so we let the LLM generate the test code and the corresponding expected results from its own knowledge.
+
+### 3.4 Package Installation and Verification
+#### 3.4.1 Installation
+- To avoid polluting the local Python environment, installation and verification run inside virtual environments created with venv.
+- From the collected package information we know each package's dependencies, so before a batch install we derive an installation order with a graph algorithm. This not only lowers the chance of install failures (a package's dependencies are already installed before the package itself), but also lets us compare a package's dependency version constraints against the versions already installed, spotting potential version conflicts before installing.
+- The project picks a virtual environment whose installed versions do not conflict with the constraints of the package about to be installed; if no such environment exists, it creates a new one, then installs the package with pip or uv.
+- After installation we run the `"check before verification"` step. PyPI packages are usually shipped as `wheel`s that bundle the shared libraries the software needs, so we search the package's shared objects under `site-packages` and use the `ldd` command to check whether they link correctly against the libraries in the environment; any unresolved library is handed to the LLM to analyze and fix (for example, by installing the missing dependency through the system package manager). A sketch of this check follows this list.
+- Locating those shared objects can itself go wrong, because the package directory under `site-packages` uses the import name, which may differ from the install name. We solve this with the `top_level.txt` file inside the `<package>-<version>.dist-info` directory, which gives the correct import name and thus the right location of the package's shared libraries.
+- When installation fails, the system records the failure information.
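+
+As a concrete picture of this pre-check, the sketch below reads the import names from `top_level.txt` and runs `ldd` over the package's shared objects. The helper names, the site-packages path, and the underscore filter are illustrative assumptions, not the project's actual implementation (the project delegates the choice of public names to the LLM):
+```python
+# Illustrative sketch of the "check before verification" linkage scan.
+import subprocess
+from pathlib import Path
+
+def top_level_modules(site_packages: Path, dist_name: str) -> list[str]:
+    """Read import names from <package>-<version>.dist-info/top_level.txt."""
+    for dist_info in site_packages.glob(f"{dist_name.replace('-', '_')}-*.dist-info"):
+        top = dist_info / "top_level.txt"
+        if top.exists():
+            # Crude filter: drop names like "_yaml"; the project asks an LLM instead.
+            return [m for m in top.read_text().split() if not m.startswith("_")]
+    return []
+
+def unresolved_libs(site_packages: Path, module: str) -> dict[str, list[str]]:
+    """Run ldd over every .so under the module and collect 'not found' entries."""
+    missing: dict[str, list[str]] = {}
+    for so in (site_packages / module).rglob("*.so*"):
+        out = subprocess.run(["ldd", str(so)], capture_output=True, text=True).stdout
+        bad = [line.split()[0] for line in out.splitlines() if "not found" in line]
+        if bad:
+            missing[str(so)] = bad
+    return missing
+
+sp = Path(".venv/lib/python3.11/site-packages")  # example path, adjust to the venv
+for mod in top_level_modules(sp, "opencv-python"):
+    print(mod, unresolved_libs(sp, mod))
+```
+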
+#### 3.4.2 Verification
+- Once a package is installed and passes the environment check, the verification code is executed; verification consists of import verification, basic functionality verification, and GPU usage verification.
+- Import verification: the install name may differ from the import name, and LLM hallucination can make the generated code import the wrong name and fail the test (again, `opencv-python` is installed with `pip install opencv-python` but imported with `import cv2`). Unfortunately, the correct import name is only known after installation, by reading the package's `top_level.txt`, and some of the names there are internal interfaces: `pyyaml`'s `top_level.txt` lists the two top-level modules `yaml` and `_yaml`, but `_yaml` is an internal API that should not be imported directly. Based on these observations, we first read the top-level module names from the installed package, then hand them to the LLM, which decides which of them the import test should use.
+- Basic functionality verification: the LLM-generated test cases can themselves be wrong, so on a failure we pass the virtual environment, the verification code, and the result to the LLM; if it judges that faulty test code caused the failure, it generates new verification code and retests.
+- GPU usage test: if no GPU test code was generated during package analysis, the package is considered not to need a GPU; otherwise the GPU test runs. During testing we found that some packages load system shared libraries dynamically at runtime, which `ldd` cannot reveal beforehand, so we designed the `"resolve on error"` method: when a test fails, the LLM checks whether a missing shared library caused the failure, and if so installs it through the system package manager and reruns the test.
+- Result collection: first, any changes made to the verification code during this phase are synced back to the database/JSON file; second, test results and logs are collected in a standardized form and the final results are emitted.
+
+### 3.5 AI Agent and MCP
+- MCP is an open protocol that standardizes how applications provide context to LLMs, helping you build agents and complex workflows on top of LLMs.
+- MCP Servers: servers are the components that expose external data and tools. For system safety, we cannot let the model call arbitrary system tools, so each MCP server's capabilities are minimized: for example, when uninstalling PyPI packages, the LLM may only remove packages it installed itself; it may install system software through the system package manager but may not uninstall system software. To cover package analysis, installation, and verification through MCP, we provide four services: a GitHub information service, a PyPI package analysis service, a dependency analysis service, and a package testing service.
+- MCP Clients: the client is the bridge between the host and a server. It keeps a one-to-one connection with its server and initializes the server's environment (such as the Python interpreter path and system environment variables); it also exposes the MCP tools to the LLM and executes tool calls on its behalf.
+- Overall workflow: the system instructs the LLM via a `System Message` to solve the task with MCP tools; the LLM issues tool calls as JSON in an `Assistant Message`; the MCP tool returns its result as a `User Message` or `Tool Message`; after several rounds of messages, the LLM returns the final result to the system.
+
+## 4. Core Implementation
+### 4.1 Data Formats
+1. For flexibility, `package_info.py` defines the `PackageInfo` and `Package` classes (derived from pydantic's `BaseModel`) to carry data between modules; their fields are:
+```python
+class PackageInfo(BaseModel):
+    """Data model for package information"""
+    dependency: List[str] = Field(default_factory=list)  # dependencies, e.g. [numpy >=1.6.0, pandas Any]
+    import_test_code: str = ""               # import-test code
+    import_test_expected_result: str = ""    # expected import-test result
+    function_test_code: str = ""             # basic functionality test code
+    function_test_expected_result: str = ""  # expected functionality-test result
+    gpu_test_code: str = ""                  # GPU usage test code
+    gpu_test_expected_result: str = ""       # expected GPU-test result
+    verified: str = "False"                  # whether verification has completed
+```
+
+```python
+class Package(BaseModel):
+    """Complete package record"""
+    package_name: str   # package name
+    info: PackageInfo   # PackageInfo instance
+    exists: bool        # whether the package exists on PyPI
+```
+
+2. For storage, the project supports both a database and a JSON file; format conversion is encapsulated in `package_converter.py`, and database/file access in `package_repository.py`.
+
+### 4.2 Implementation: Collecting PyPI Packages from GitHub
+#### 4.2.1 Obtaining PyPI package names from GitHub for a given topic, in these steps:
+1. Query GitHub with the given topic; to keep only widely recognized repositories, select those with more than 1000 stars and record them in a .csv file with the following fields:
+- "name": repository name,
+- "url": repository URL,
+- "language": primary language,
+- "stars": star count,
+- "description": repository description,
+- "updated_at": last update date.
+2. Read the .csv file, fetch each repository's README.md from its URL, and let the LLM analyze whether it mentions PyPI packages; if so, record all of that repository's package names as a list `package_list`, e.g. `[numpy, pandas, ...]`, and write `name`, `url`, and `package_list` to a .txt file.
+
+#### 4.2.2 Generating Package records from the collected names
+1. Read the pypi.txt file to get all PyPI package names, call the `assign_task_to_llm_mcp` function, and pass the names to the MCP module as arguments.
+2. In the MCP module, a `system`-role message asks the LLM to analyze each PyPI package with the following steps:
+- call the `check_pypi_mcp` tool to check whether the name exists on PyPI; if not, the name is wrong and processing stops;
+- if the package exists, call the `find_dependency_for_pip_package_mcp` tool and combine it with the model's own knowledge to find the package's dependencies and each one's version constraints;
+- if the package exists, call the `find_gpu_requirement_for_pip_package_mcp` tool and combine it with the model's own knowledge to judge whether the package needs a GPU for full functionality;
+- if the package exists, generate the verification code and expected results from the information above, covering the import test, the core functionality test, and the GPU usage test, each with its expected result. Some outputs cannot be known before execution (for example, code that prints the current time), so expected results are written as regular expressions, and at test time a result passes if it matches its expression;
+- save the generated Package record to the database/JSON file.
+
+### 4.3 Implementation: Package Installation and Verification
+#### 4.3.1 Topological ordering
+1. Load every Package record whose `exists` field is `true` from the database/JSON file, read the `package_name` and `info.dependency` fields, and organize them as `{"package_name": ["dependency", ...]}`.
+2. Call `build_dependency_graph` to strip the version constraints from the names and build a directed acyclic graph with `dependency -> dependent` edges.
+3. Call `topological_sort` to derive the installation order from that graph; if a cycle is found during the sort, generation fails and the program exits. (A minimal sketch of steps 2-3 follows below.)
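+
+As an illustrative stand-in for `build_dependency_graph`/`topological_sort` (the project's own functions may differ), here is Kahn's algorithm over the `{"package_name": ["dependency", ...]}` shape just described; a cycle leaves nodes unprocessed and raises:
+```python
+# Kahn's algorithm: dependencies come out before the packages that need them.
+from collections import deque
+
+def topo_order(pkg_deps: dict[str, list[str]]) -> list[str]:
+    # Strip version constraints, e.g. "numpy >=1.21" -> "numpy".
+    deps = {p: {d.split()[0] for d in ds} for p, ds in pkg_deps.items()}
+    nodes = set(deps) | {d for ds in deps.values() for d in ds}
+    indegree = {n: 0 for n in nodes}
+    dependents: dict[str, list[str]] = {n: [] for n in nodes}
+    for pkg, ds in deps.items():
+        for d in ds:
+            indegree[pkg] += 1          # pkg waits on each of its dependencies
+            dependents[d].append(pkg)   # edge dependency -> dependent
+    queue = deque(n for n in nodes if indegree[n] == 0)
+    order = []
+    while queue:
+        n = queue.popleft()
+        order.append(n)
+        for m in dependents[n]:
+            indegree[m] -= 1
+            if indegree[m] == 0:
+                queue.append(m)
+    if len(order) != len(nodes):
+        raise ValueError("dependency cycle detected")  # mirrors the abort-on-cycle rule
+    return order
+
+print(topo_order({"pandas": ["numpy >=1.21", "python-dateutil Any"], "numpy": []}))
+```
+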
+#### 4.3.2 Environment preparation
+1. Following the topological order, the system installs and verifies packages one by one. The order includes some dependency-only packages that have no Package record in the database/JSON file; these are installed into the default virtual environment.
+2. For packages that do have a Package record, the system first checks whether the default virtual environment is adequate: it calls `detect_potential_version_conflicts` to compare the versions already installed in that environment against the constraints in `info.dependency`; if everything is within range, the environment qualifies. On a conflict, the system walks through all virtual environments until one qualifies or all of them fail; if all fail, it calls `create_venv` to create a new virtual environment.
+
+#### 4.3.3 Installation
+1. Having chosen the right virtual environment, the system calls `install_package` to install the package there.
+2. On success, execution continues; on failure, `assign_task_to_llm_mcp` is called to analyze and repair the failure, with the following arguments:
+- the PyPI package name,
+- the install command (pip or uv),
+- standard output (stdout),
+- error output (stderr).
+3. In the MCP module, a `system`-role message asks the LLM to resolve the install failure with the following steps:
+- Analyze the cause of the failure.
+- Use its own knowledge plus the MCP tools to fix it:
+    + if the error comes from a missing system dependency, call `detect_system_package_manager_mcp` to identify the system's package manager (dnf/yum/apt, etc.), then `install_system_package_mcp` to install the missing system package, then reinstall the PyPI package;
+    + if the error comes from a missing PyPI package, call the `install_pypi_package_mcp` tool to install it, then reinstall the original package;
+    + if the error comes from a misconfigured virtual environment, call the `create_virtual_env_mcp` tool to create a new environment and reinstall the package there.
+- Repeat until the package installs successfully or the retry limit is reached.
+- Record the result on failure; proceed to verification on success.
+
+#### 4.3.4 System environment check
+1. After a successful install, `pre_resolve_environment` is called to analyze whether the package links correctly against the system's dependency libraries. It first calls `find_dynamic_libs` to locate the package's shared objects, then `analyze_lib_with_ldd` to check whether each one links correctly against the system libraries; unresolved libraries trigger `auto_install_missing_dependencies`, after which the linkage is rechecked until everything resolves.
+2. Inside `find_dynamic_libs`, `detect_import_names` is called first: it locates the package's `top_level.txt` via the `<package>-<version>.dist-info` directory under `site-packages` and asks the LLM which of the top-level module names are the ones used at import time. With that name, `find_dynamic_libs` walks the package directory for `.so` files and inspects them with `ldd` to determine whether any linkage fails.
+3. `auto_install_missing_dependencies` installs whatever the linkage check found missing; since a missing dependency may come from a single shared object inside some package, the LLM is used to map it to the right system-package name before the install command is run.
+
+#### 4.3.5 Verification
+1. If a package has no `Package` record, it is only a dependency rather than a package to verify, so verification is skipped; otherwise the verification code is generated.
+2. Test cases and expected results (expressed as regular expressions; a sketch of this matching follows this section) come from the `Package` fields `info.import_test_code`, `info.import_test_expected_result`, `info.function_test_code`, `info.function_test_expected_result`, `info.gpu_test_code`, and `info.gpu_test_expected_result`. Some packages need to download a small model from `HuggingFace` to verify functionality, but `HuggingFace` is not directly reachable from mainland China, so the generated code prepends `import os; os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"` to download the model from the domestic mirror instead.
+3. When a test errors out, the MCP module is invoked: a `system`-role message asks the LLM to resolve the test failure with the following steps:
+- Analyze the cause of the error.
+- Use its own knowledge plus the MCP tools to handle these cases:
+    + a syntax error: fix the test code directly and retest;
+    + a module-not-found error: call `detect_import_name_mcp` to find the correct import name and retest;
+    + a missing system dependency: call `detect_system_package_manager_mcp` to identify the system's package manager (dnf/yum/apt, etc.), then `install_system_package_mcp` to install the missing system package, and retest;
+    + a misconfigured virtual environment: call the `create_virtual_env_mcp` tool to create a new environment and retest there.
+- Repeat until the problem is solved or the retry limit is reached.
+4. Finally, each result is classified as `COMPATIBLE` or `INCOMPATIBLE`, meaning all tests passed or some test failed, respectively.
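+
+To make the regex convention concrete, here is a sketch of running a generated test inside its virtual environment and matching stdout against the expected pattern. `run_test`, the venv path, and the timeout are hypothetical; only the regex convention and the COMPATIBLE/INCOMPATIBLE outcome come from the design above:
+```python
+# Sketch: execute LLM-generated test code in a venv and check the result.
+import re
+import subprocess
+
+def run_test(venv_python: str, code: str, expected_regex: str) -> bool:
+    proc = subprocess.run([venv_python, "-c", code],
+                          capture_output=True, text=True, timeout=600)
+    if proc.returncode != 0:
+        return False  # in the real system, stderr would go to the LLM for repair
+    return re.search(expected_regex, proc.stdout.strip()) is not None
+
+ok = run_test(
+    "/root/venvs/env0/bin/python",                        # hypothetical venv
+    "from google import protobuf; print(protobuf.__version__)",
+    r"^\d+\.\d+(\.\d+)?.*$",                              # version-number pattern
+)
+print("COMPATIBLE" if ok else "INCOMPATIBLE")
+```
+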
+## Running the Tool
+1. Environment preparation
+- Install the GPU drivers and the jq tool
+- Create a virtual environment to run the system itself in
+- Edit the mcp_servers_config.json file
+- Create the results and tmp folders, and put 软件列表.xlsx into the tmp folder
+- Create a venvs directory to hold the virtual environments used during installation and verification
+- Edit the config.json file
+2. Run
+- To run the complete pipeline, from collecting PyPI names on GitHub through verification, run the run.sh script:
+    ```shell
+    sh ./run.sh --venv=/root/.main_venv --use-llm --topic=ai --fetch-github-repos --generate-package-info --verify-packages
+    ```
+- To test the package list provided by the task committee, run run_for_软件列表.sh:
+    ```shell
+    sh ./run_for_软件列表.sh --venv=/root/.main_venv --use-llm --generate-package-info --verify-packages
+    ```
+3. Output
+- The results folder holds each package's detailed test results and the final report
+- Final report
+    The report aggregates the following fields (an illustrative sample report appears at the end of this section):
+    + `total_packages`: total number of packages to verify;
+    + `not_found_packages`: packages that could not be found on PyPI, e.g. because of bad names in the input list;
+    + `total_exists_packages`: packages found on PyPI;
+    + `create_env_failed_packages`: packages that failed because a virtual environment could not be created;
+    + `install_failed_packages`: packages that failed to install;
+    + `env_resolve_failed_packages`: packages that failed because the system environment could not be resolved;
+    + `verify_failed_packages`: packages that failed verification, i.e. status `INCOMPATIBLE`;
+    + `successful_packages`: packages verified successfully, i.e. status `COMPATIBLE`;
+    + `install_rate`: install success rate, i.e. successfully installed packages / packages found on PyPI;
+    + `compatibility_rate`: verification success rate, i.e. successfully verified packages / packages found on PyPI;
+    + `install_rate_total`: overall install success rate, i.e. successfully installed packages / total packages to verify;
+    + `compatibility_rate_total`: overall verification success rate, i.e. successfully verified packages / total packages to verify;
+    + `details`: per-package installation and verification details;
+    + `timestamp`: the current timestamp
+
+4. Recommended setup
+- Environment preparation
+    + Start from the official `opencloudos/opencloudos9-minimal:latest` Docker image and create a container from it, remembering to pass `--gpus all`; recommended command:
+    ```shell
+    docker run -it --name opencloudos9 --gpus all -p 8000:8000 opencloudos/opencloudos9-minimal:latest bash
+    ```
+    + Create the python symlink and install pip:
+    ```shell
+    ln -s /usr/bin/python3 /usr/bin/python
+    dnf install -y python3-pip
+    python -m pip install -U pip  # upgrade pip to the latest version
+    ```
+    + Install jq, used for parsing JSON files:
+    ```shell
+    dnf install -y jq
+    ```
+    + Create the project's virtual environment under the code directory:
+    ```shell
+    cd code
+    python -m venv .main_venv
+    ```
+    + Under the code directory, create the tmp, results, and venvs directories
+    + Edit code/mcp_chat_bot/mcp_servers/mcp_servers_config.json (absolute paths recommended):
+    ```json
+    {
+        "mcpServers": {
+            "github_analyst": {
+                "command": "/root/oc_contributor_huangzhenye/code/.main_venv/bin/python",
+                "args": [
+                    "-u",
+                    "/root/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/github_analyst.py"
+                ]
+            },
+            "pypi_analyst": {
+                "command": "/root/oc_contributor_huangzhenye/code/.main_venv/bin/python",
+                "args": [
+                    "-u",
+                    "/root/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/pypi_analyst.py"
+                ]
+            },
+            "dependency_analyst": {
+                "command": "/root/oc_contributor_huangzhenye/code/.main_venv/bin/python",
+                "args": [
+                    "-u",
+                    "/root/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/dependency_analyst.py"
+                ]
+            },
+            "test_executor": {
+                "command": "/root/oc_contributor_huangzhenye/code/.main_venv/bin/python",
+                "args": [
+                    "-u",
+                    "/root/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/test_executor.py"
+                ]
+            }
+        }
+    }
+    ```
+    + Edit the config.json file under the code directory (absolute paths recommended):
+    ```json
+    {
+        "github_access_token": "",
+        "llm_model_name": "deepseek-chat",
+        "llm_access_token": "",
+        "llm_base_url": "https://api.deepseek.com/chat/completions",
+
+        "//": "How and where data is stored; saving to a JSON file or to a database are both supported",
+        "save_method": "db",
+        "json_file_path": "/root/oc_contributor_huangzhenye/code/package_manager/package_info.json",
+        "db_path": "/root/oc_contributor_huangzhenye/code/package_info.db",
+        "result_path": "/root/oc_contributor_huangzhenye/code/results",
+
+        "//": "Location of the tmp directory, which holds 软件列表.xlsx and temporary files",
+        "tmp_path": "/root/oc_contributor_huangzhenye/code/tmp",
+
+        "//": "Path holding the virtual environments used for installation and verification; do not put the project's own runtime virtual environment here, to avoid breaking the project's runtime environment",
+        "venvs_path": "/root/oc_contributor_huangzhenye/code/venvs"
+    }
+    ```
+    + Check that MCP works:
+    ```shell
+    source code/.main_venv/bin/activate
+    pip install -r code/requirements.txt
+    python code/mcp_chat_bot/unit_tests/single_prompt.py --url https://github.com/numpy/numpy
+    ```
+- Run
+    + Run the shell script from the code directory:
+    ```shell
+    sh ./run_for_软件列表.sh --venv=/root/oc_contributor_huangzhenye/code/.main_venv --use-llm --generate-package-info --verify-packages
+    ```
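+
+As promised in step 3 above, a final report might look like the sample below. All counts, rates, the `details` shape, and the timestamp are invented for illustration, not measured results:
+```json
+{
+    "total_packages": 120,
+    "not_found_packages": 5,
+    "total_exists_packages": 115,
+    "create_env_failed_packages": 0,
+    "install_failed_packages": 8,
+    "env_resolve_failed_packages": 2,
+    "verify_failed_packages": 10,
+    "successful_packages": 95,
+    "install_rate": 0.930,
+    "compatibility_rate": 0.826,
+    "install_rate_total": 0.892,
+    "compatibility_rate_total": 0.792,
+    "details": {"numpy": "COMPATIBLE"},
+    "timestamp": "2025-08-01 12:00:00"
+}
+```
+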
+### Notes
+1. Because MCP installs software with dnf, running as root is recommended (to be optimized later)
+2. Generating the information and installing and testing the packages is a complex process. In local testing, generating the `Package` records from `软件列表.xlsx` can take half a day to a full day; the installation and verification phase was never run end to end locally for lack of disk space, and a complete run is estimated to take one to two days. For a sample run, trim `软件列表.xlsx` down to fewer rows first.
+3. To speed things up, the important folder also ships a `package_info.db` database containing the `Package` records the project generated from `软件列表.xlsx`. Copy it into the code directory and then run
+```shell
+sh ./run_for_软件列表.sh --venv=/root/oc_contributor_huangzhenye/code/.main_venv --use-llm --verify-packages
+```
+to start verification directly.
+
+## Conclusion and Outlook
+### Conclusion
+1. Early on, I focused on collecting PyPI packages and their related information as thoroughly as possible, and settled on first checking whether a package exists, then pulling the details from PyPI. For installation, my first idea was to install packages in dependency order so that version conflicts surface ahead of time, and then to avoid conflicts by creating new virtual environments. For verification, I realized that most AI packages need a GPU for full functionality, so I designed three kinds of tests: import, basic functionality, and GPU usage.
+2. I found that generating package information is a complex and variable process that also needs an LLM to generate verification code. To gather and generate this information more intelligently, I studied the MCP protocol and built my own MCP bot on top of the source of an [open-source project](https://github.com/keli-wen/mcp_chatbot); thanks to `keliwen@stu.pku.edu.cn` for this contribution to the open-source community. The bot was subsequently reused in package installation, package verification, and several other stages.
+3. Innovations:
+- Combine GitHub and PyPI information to obtain more accurate package details.
+- Use MCP tooling for smarter package analysis, installation, and verification.
+- Analyze the PyPI install process and the files it produces to anticipate problems such as missing environment dependencies, version conflicts, and install-name/import-name mismatches.
+- Provide system package-management tools to fix install/verification failures caused by missing system dependencies.
+
+## References
+[1] Peng Y, Hu R, Wang R, et al. Less is more? An empirical study on configuration issues in the Python PyPI ecosystem[C]//Proceedings of the IEEE/ACM 46th International Conference on Software Engineering. 2024: 1-12.
+
+[2] MCP Specification: https://modelcontextprotocol.io/specification/2025-06-18
+
+[3] mcp_chatbot: https://github.com/keli-wen/mcp_chatbot
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/assets/AI\350\275\257\344\273\266\350\207\252\345\212\250\351\252\214\350\257\201\345\267\245\345\205\267.png" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/assets/AI\350\275\257\344\273\266\350\207\252\345\212\250\351\252\214\350\257\201\345\267\245\345\205\267.png"
new file mode 100644
index 0000000000000000000000000000000000000000..03954efb83513417682f3e9af245d88c175bf17e
Binary files /dev/null and "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/assets/AI\350\275\257\344\273\266\350\207\252\345\212\250\351\252\214\350\257\201\345\267\245\345\205\267.png" differ
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/ai_agent/llm_api.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/ai_agent/llm_api.py"
new file mode 100644
index 0000000000000000000000000000000000000000..95435f4256dc8b5e47b600a707025b463a9dca94
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/ai_agent/llm_api.py"
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+import uuid
+import json
+import requests
+import os
+import sys
+
+CONFIG_FILE = os.path.join(os.path.dirname(__file__), '../config.json')
+
+def call_llm_api(prompt,
+                 temperature=1.0,
+                 top_p=1.0,
+                 max_tokens=8192,
+                 api_key=None,
+                 verbose=False):
+    """
+    Call the LLM API.
+
+    Args:
+        prompt (list): chat messages to send, e.g. [{"role": "user", "content": "..."}]
+        temperature (float): sampling temperature controlling output randomness, default 1.0
+        top_p (float): nucleus-sampling parameter, default 1.0
+        max_tokens (int): maximum number of output tokens, default 8192
+        api_key (str): API key; when None, it is read from config.json (llm_access_token)
+        verbose (bool): whether to print request/response details, default False
+
+    Returns:
+        str: response content returned by the API
+    """
+
+    # Read the model name, API key, and endpoint from the config file
+    with open(CONFIG_FILE, 'r') as f:
+        config = json.load(f)
+    model = config['llm_model_name']
+    if api_key is None:
+        api_key = config['llm_access_token']
+    ss_url = config['llm_base_url']
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}",
+    }
+
+    # Build the request payload
+    json_data = {
+        "query_id": "query_id_" + str(uuid.uuid4()),
+        "model": model,
+        # "messages": [
+        #     {"role": "user", "content": prompt}
+        # ],
+        "messages": prompt,
+        "temperature": temperature,
+        "top_p": top_p,
+        "max_tokens": max_tokens,
+        "stream": False
+    }
+
+    if verbose:
+        print('Input:\n{} | {} | {}'.format(ss_url, headers, json_data))
+
+    try:
+        resp = requests.post(ss_url, headers=headers, json=json_data)
+
+        if verbose:
+            print(f'Output: {resp}')
+
+        # Non-streaming response handling
+        if resp.status_code == 200:
+            response_data = resp.json()
+            if 'choices' in response_data and len(response_data['choices']) > 0:
+                content = response_data['choices'][0]['message']['content']
+                return content
+            else:
+                return resp.text
+        else:
+            if verbose:
+                print(f"Request failed with status code: {resp.status_code}")
+            return resp.text
+
+    except Exception as e:
+        if verbose:
+            print(f"Error while calling the API: {e}")
+        return f"Error: {str(e)}"
+
+
+def read_file_content(file_path):
+    """
+    Read a file's content.
+
+    Args:
+        file_path (str): file path
+
+    Returns:
+        str: file content, or None if reading fails
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read().strip()
+    except Exception as e:
+        print(f"Failed to read file {file_path}: {e}")
+        return None
+
+
+def main():
+    """
+    Entry point for testing the API call.
+    An input file may be given as a command-line argument.
+    """
+    # Check command-line arguments
+    if len(sys.argv) > 1:
+        file_path = sys.argv[1]
+        if os.path.exists(file_path):
+            print(f"=== Reading input from file: {file_path} ===")
+            test_prompt = read_file_content(file_path)
+            if test_prompt is None:
+                return
+        else:
+            print(f"File does not exist: {file_path}")
+            return
+    else:
+        # Default test case
+        test_prompt = "Who are you"
+        print("=== Using the default test input ===")
+
+    print(f"Input: {test_prompt[:100]}{'...' 
if len(test_prompt) > 100 else ''}")
+    result = call_llm_api([{"role": "user", "content": test_prompt}], verbose=True)
+    print(f"\nResult: {result}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/config.json" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/config.json"
new file mode 100644
index 0000000000000000000000000000000000000000..faff9d75ba5f3c419111385139afe22272b2955b
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/config.json"
@@ -0,0 +1,18 @@
+{
+    "github_access_token": "",
+    "llm_model_name": "deepseek-chat",
+    "llm_access_token": "",
+    "llm_base_url": "https://api.deepseek.com/chat/completions",
+
+    "//": "How and where data is stored; saving to a JSON file or to a database are both supported",
+    "save_method": "db",
+    "json_file_path": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/package_manager/package_info.json",
+    "db_path": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/package_info.db",
+    "result_path": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/results",
+
+    "//": "Location of the tmp directory, which holds 软件列表.xlsx and temporary files",
+    "tmp_path": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/tmp",
+
+    "//": "Path holding the virtual environments used for installation and verification; do not put the project's own runtime virtual environment here, to avoid breaking the project's runtime environment",
+    "venvs_path": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/venvs"
+}
\ No newline at end of file
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_converter.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_converter.py"
new file mode 100644
index 0000000000000000000000000000000000000000..549afeae703d2c37c5cc59f1d5e3057b16c46af1
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 
AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_converter.py" @@ -0,0 +1,145 @@ +import sys +import json +from pathlib import Path +from typing import Dict, Any, Optional + +root_dir = Path(__file__).parent.parent +sys.path.insert(0, str(root_dir)) +from data_manager.package_info import Package, PackageInfo + + +class PackageConverter: + """包信息格式转换器""" + + @staticmethod + def dict_to_model(data: Dict[str, Any]) -> Package: + """将字典转换为Package模型""" + # 处理info字段为字符串错误信息的情况 + if isinstance(data.get("info"), str): + # 如果info是字符串错误信息,创建一个全是空字符串的PackageInfo对象 + data_copy = data.copy() + data_copy["info"] = PackageInfo() # 使用默认值创建空的PackageInfo + return Package.model_validate(data_copy) + return Package.model_validate(data) + + @staticmethod + def model_to_dict(package: Package) -> Dict[str, Any]: + """将Package模型转换为字典""" + return package.model_dump() + + @staticmethod + def db_row_to_model(row: tuple) -> Package: + """将数据库行转换为Package模型""" + # 检查是否存在包信息 + if len(row) >= 10 and row[9]: # exists为True + package_info = PackageInfo( + dependency=json.loads(row[1]) if row[1] else [], + import_test_code=row[2] or "", + import_test_expected_result=row[3] or "", + function_test_code=row[4] or "", + function_test_expected_result=row[5] or "", + gpu_test_code=row[6] or "", + gpu_test_expected_result=row[7] or "", + verified=row[8] or "False" + ) + else: + # 创建空的PackageInfo对象而不是None + package_info = PackageInfo() + + return Package( + package_name=row[0], + info=package_info, + exists=row[9] if len(row) >= 10 else False + ) + + @staticmethod + def json_item_to_model(package_name: str, item_info: Dict[str, Any]) -> Package: + """将JSON项转换为Package模型""" + if item_info['exists'] == "True": + package_info = PackageInfo.model_validate(item_info) + else: + # 创建空的PackageInfo对象 + package_info = PackageInfo() + + return Package( + package_name = package_name, + info = package_info, + exists = True if item_info['exists'] == "True" else False + ) + +if __name__ == "__main__": + # 测试代码 + print("------ dict-to-model 1-------------") + sample_data = { + "package_name": "example-package", + "info": { + "dependency": ["numpy"], + "import_test_code": "import example", + "import_test_expected_result": "", + "function_test_code": "print(example.func())", + "function_test_expected_result": "42", + "gpu_test_code": "", + "gpu_test_expected_result": "", + "verified": "False" + }, + "exists": True + } + package1 = PackageConverter.dict_to_model(sample_data) + print(package1) + + print("------ dict-to-model 2 (not exists)-------------") + sample_data = { + "package_name": "numpy", + "info": "Not package found in PyPI", + "exists": False + } + package2 = PackageConverter.dict_to_model(sample_data) + print(package2) + + print("------ model-to-dict -------------") + package_dict = PackageConverter.model_to_dict(package2) + print(json.dumps(package_dict, indent=4)) + + print("------ db-row-to-model -------------") + # 测试存在的包 + if package1.exists and package1.info and isinstance(package1.info, PackageInfo): + db_row = ( + package1.package_name, + json.dumps(package1.info.dependency), + package1.info.import_test_code, + package1.info.import_test_expected_result, + package1.info.function_test_code, + package1.info.function_test_expected_result, + package1.info.gpu_test_code, + package1.info.gpu_test_expected_result, + package1.info.verified, + package1.exists, + ) + 
package_from_db = PackageConverter.db_row_to_model(db_row) + print("Existing package from DB:", package_from_db) + else: + print("Skipping db_row_to_model test for non-existent package") + + # 测试不存在的包的数据库行 + db_row_non_existent = ("non-existent-package", "[]", "", "", "", "", "", "", "False", False) + package_from_db_non_existent = PackageConverter.db_row_to_model(db_row_non_existent) + print("Non-existent package from DB:", package_from_db_non_existent) + + print("------ json-item-to-model -------------") + sample_data = { + "protobuf": { + "dependency": [], + "import_test_code": "from google import protobuf", + "import_test_expected_result": "", + "function_test_code": "import sys; from google import protobuf; result = protobuf.__version__; print(result)", + "function_test_expected_result": "^\\d+\\.\\d+(\\.\\d+)?.*$", + "gpu_test_code": "", + "gpu_test_expected_result": "", + "exists": "True" + } + } + name_and_info_to_model = PackageConverter.json_item_to_model( + "protobuf", + sample_data["protobuf"] + ) + print(name_and_info_to_model) diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_info.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_info.py" new file mode 100644 index 0000000000000000000000000000000000000000..ba1175b26fc8f0baa96a07826dd68f0c719fd102 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_info.py" @@ -0,0 +1,19 @@ +from typing import List, Optional, Union +from pydantic import BaseModel, Field + +class PackageInfo(BaseModel): + """包信息的数据模型""" + dependency: List[str] = Field(default_factory=list) + import_test_code: str = "" + import_test_expected_result: str = "" + function_test_code: str = "" + function_test_expected_result: str = "" + gpu_test_code: str = "" + gpu_test_expected_result: str = "" + verified: str = "False" + +class Package(BaseModel): + """完整的包记录模型""" + package_name: str + info: PackageInfo + exists: bool \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_repository.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 
AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_repository.py" new file mode 100644 index 0000000000000000000000000000000000000000..759089fec2af20a241734582bc178b63fac479d2 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/data_manager/package_repository.py" @@ -0,0 +1,153 @@ +import sqlite3 +import sys +import json +from typing import List, Optional +from pathlib import Path + +root_dir = Path(__file__).parent.parent +sys.path.insert(0, str(root_dir)) + +from data_manager.package_info import Package, PackageInfo +from data_manager.package_converter import PackageConverter + +class PackageRepository: + """包信息仓库""" + + def __init__(self, db_path: str, json_path: str): + self.db_path = db_path + self.json_path = json_path + + def get_certain_package_list_from_db(self, condition: Optional[str] = None) -> List[Package]: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + query = f"SELECT * FROM packages" + if condition: + query += f" WHERE {condition}" + cursor.execute(query) + rows = cursor.fetchall() + return [PackageConverter.db_row_to_model(row) for row in rows] + + def modify_package_in_db(self, package_name: str, column: str, new_value) -> None: + """修改数据库中某个包的指定字段""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + query = f"UPDATE packages SET {column} = ? WHERE package_name = ?" + cursor.execute(query, (new_value, package_name)) + conn.commit() + + def save_to_db(self, package: Package) -> None: + """保存包信息到数据库""" + with sqlite3.connect(self.db_path) as conn: + cursor = conn.cursor() + self._ensure_table_exists(cursor) + + info = package.info + cursor.execute(""" + INSERT INTO packages + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(package_name) DO UPDATE SET + package_name=excluded.package_name, + dependency=excluded.dependency, + import_test_code=excluded.import_test_code, + import_test_expected_result=excluded.import_test_expected_result, + function_test_code=excluded.function_test_code, + function_test_expected_result=excluded.function_test_expected_result, + gpu_test_code=excluded.gpu_test_code, + gpu_test_expected_result=excluded.gpu_test_expected_result, + verified=excluded.verified, + `exists`=excluded.`exists` + """, ( + package.package_name, + json.dumps(info.dependency), + info.import_test_code, + info.import_test_expected_result, + info.function_test_code, + info.function_test_expected_result, + info.gpu_test_code, + info.gpu_test_expected_result, + info.verified, + package.exists + )) + + def get_certain_package_list_from_json(self, attr: Optional[str] = None, value: Optional[str] = None) -> List[Package]: + packages = [] + with open(self.json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for pkg_name, item_info in data.items(): + if (attr is None or item_info.get(attr) == value): + package = PackageConverter.json_item_to_model(pkg_name, item_info) + packages.append(package) + return packages + + def modify_package_in_json(self, package_name: str, key: str, new_value) -> None: + """修改JSON文件中某个包的指定字段""" + try: + with open(self.json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + data = {} + + if package_name in data: + data[package_name][key] = new_value + with open(self.json_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + def save_to_json(self, package: Package) -> None: + """保存包信息到JSON文件""" + try: + with open(self.json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + data = {} + + package_dict = PackageConverter.model_to_dict(package) + # 将exists字段添加到info中 + item_info = package_dict['info'] + item_info['exists'] = package_dict['exists'] + data[package.package_name] = item_info + + with open(self.json_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + def _ensure_table_exists(self, cursor: sqlite3.Cursor) -> None: + """确保数据表存在""" + cursor.execute(""" + CREATE TABLE IF NOT EXISTS packages ( + package_name TEXT PRIMARY KEY, + dependency TEXT, + import_test_code TEXT, + import_test_expected_result TEXT, + function_test_code TEXT, + function_test_expected_result TEXT, + gpu_test_code TEXT, + gpu_test_expected_result TEXT, + verified TEXT, + `exists` BOOLEAN + ) + """) + +if __name__ == "__main__": + package_respository = PackageRepository("package_info_test.db", "package_info_test.json") + package1 = Package( + package_name="numpy", + info=PackageInfo( + dependency=["setuptools"], + import_test_code="import numpy as np", + import_test_expected_result="", + function_test_code="print(np.__version__)", + function_test_expected_result=r"\d+\.\d+\.\d+", + gpu_test_code="", + gpu_test_expected_result="", + verified="True" + ), + exists=True + ) + package_respository.save_to_db(package1) + package_respository.save_to_json(package1) + package2 = Package( + package_name = "abc", + info = PackageInfo(), + exists = False + ) + package_respository.save_to_db(package2) + package_respository.save_to_json(package2) diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 
AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_dependency.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_dependency.py" new file mode 100644 index 0000000000000000000000000000000000000000..74091c60755baaab12854b8ad6ccea38b604b7f9 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_dependency.py" @@ -0,0 +1,788 @@ +#!/usr/bin/env python3 +"""Resolve runtime dependency package names for a PyPI package using multiple sources. + +This tool combines: +- PyPI metadata (requires_dist) +- pip download -> METADATA/PKG-INFO parsing as a fallback +- repository inspection (pyproject.toml / setup.cfg / requirements.txt) + +It filters out optional extras, test/dev dependencies and any requires_dist entries +whose environment markers do not evaluate true for the current environment. + +Usage: python analyse_pypi2.py [--version X.Y] [--no-repo] +""" +from __future__ import annotations + +import argparse +import json +import os +import re +import shutil +import subprocess +import sys +import tempfile +from typing import Dict, List, Optional, Set + +import requests + +# Optional helpers from packaging +try: + from packaging.requirements import Requirement # type: ignore + from packaging.markers import default_environment # type: ignore +except Exception: + Requirement = None # type: ignore + default_environment = None # type: ignore + +try: + import tomllib as toml # Python 3.11+ +except Exception: + try: + import toml # type: ignore + except Exception: + toml = None # type: ignore + +PYPI_JSON = "https://pypi.org/pypi/{pkg}/json" +NAME_RE = re.compile(r"^\s*([A-Za-z0-9_\-\.]+)") + + +def fetch_pypi_json(package: str, version: Optional[str] = None, timeout: int = 10) -> Optional[Dict]: + try: + url = PYPI_JSON.format(pkg=package) if version is None else f"https://pypi.org/pypi/{package}/{version}/json" + r = requests.get(url, timeout=timeout) + r.raise_for_status() + return r.json() + except Exception: + return None + + +def _parse_requires_dist(entry: str): + """Return tuple (name, version_spec, extras_set, marker) or None on parse failure. + + Uses packaging.Requirement when available for robust parsing. + Returns version_spec as "Any" if no version constraints are found. 
+ """ + if not entry: + return None + if Requirement: + try: + req = Requirement(entry) + name = req.name + extras = set(req.extras) if req.extras else set() + marker = req.marker # may be None + # Extract version specification + version_spec = str(req.specifier) if req.specifier else "Any" + return name, version_spec, extras, marker + except Exception: + pass + + # fallback simple parse + parts = entry.split(";", 1) + req_part = parts[0].strip() + marker = parts[1].strip() if len(parts) > 1 else None + + # Extract extras + extras = set() + if "[" in req_part and "]" in req_part: + before_bracket = req_part.split("[", 1)[0] + bracket_content = req_part.split("[", 1)[1].split("]", 1)[0] + req_part = before_bracket + (req_part.split("]", 1)[1] if "]" in req_part else "") + extras = set(e.strip() for e in bracket_content.split(",") if e.strip()) + + # Extract name and version specification + name_part = req_part + version_spec = "Any" + + # Look for version specifiers + version_pattern = r"([A-Za-z0-9_\-\.]+)\s*([><=!~,\s\d\.]+)" + match = re.match(version_pattern, req_part) + if match: + name_part = match.group(1) + version_spec = match.group(2).strip() + else: + # Simple name extraction + m = NAME_RE.match(req_part) + if m: + name_part = m.group(1) + + if not name_part: + return None + + return name_part, version_spec, extras, marker + + +def requires_dist_filtered(package: str, version: Optional[str] = None, include_extras: bool = False) -> List[str]: + """Get dependency info from PyPI requires_dist while filtering extras and markers. + + Returns list of strings in format 'package_name version_spec' or 'package_name Any' + """ + meta = fetch_pypi_json(package, version) + if not meta: + return [] + requires = meta.get("info", {}).get("requires_dist") or [] + env = default_environment() if default_environment else None + seen: Set[str] = set() + result: List[str] = [] + + for entry in requires: + parsed = _parse_requires_dist(entry) + if not parsed: + continue + name, version_spec, extras, marker = parsed + + # skip optional extras unless requested + if extras and not include_extras: + continue + # skip markers that reference 'extra' unless including extras + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + continue + # evaluate marker in current environment + if marker and env is not None: + try: + if not marker.evaluate(env): + continue + except Exception: + # conservative: skip if marker cannot be evaluated + continue + + if name and name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + result.append(dependency_spec) + return result + + +def _extract_metadata_requires_from_wheel(wheel_path: str, include_extras: bool = False) -> List[str]: + import zipfile + + deps: List[str] = [] + seen: Set[str] = set() + with zipfile.ZipFile(wheel_path) as z: + for f in z.namelist(): + if f.endswith("/METADATA") or f.endswith(".dist-info/METADATA"): + with z.open(f) as fh: + content = fh.read().decode(errors="ignore") + for line in content.splitlines(): + if line.startswith("Requires-Dist:"): + entry = line.split(":", 1)[1].strip() + parsed = _parse_requires_dist(entry) + if not parsed: + continue + name, version_spec, extras, marker = parsed + + # skip optional extras unless requested + if extras and not include_extras: + continue + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + continue + env = default_environment() if default_environment else None + if 
marker and env is not None: + try: + if not marker.evaluate(env): + continue + except Exception: + continue + if name and name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + deps.append(dependency_spec) + return deps + + +def _extract_metadata_requires_from_sdist(sdist_path: str, include_extras: bool = False) -> List[str]: + deps: List[str] = [] + seen: Set[str] = set() + + def process_line(line: str): + if line.startswith("Requires-Dist:") or line.startswith("Requires:"): + entry = line.split(":", 1)[1].strip() + parsed = _parse_requires_dist(entry) + if not parsed: + return + name, version_spec, extras, marker = parsed + if extras and not include_extras: + return + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + return + env = default_environment() if default_environment else None + if marker and env is not None: + try: + if not marker.evaluate(env): + return + except Exception: + return + if name and name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + deps.append(dependency_spec) + + if sdist_path.endswith((".zip", ".whl")): + import zipfile + + with zipfile.ZipFile(sdist_path) as z: + for n in z.namelist(): + if n.endswith("PKG-INFO") or n.endswith("METADATA"): + with z.open(n) as fh: + content = fh.read().decode(errors="ignore") + for line in content.splitlines(): + process_line(line) + else: + import tarfile + + with tarfile.open(sdist_path, "r:*") as tar: + members = [m for m in tar.getmembers() if m.name.endswith("PKG-INFO") or m.name.endswith("METADATA")] + for m in members: + f = tar.extractfile(m) + if not f: + continue + content = f.read().decode(errors="ignore") + for line in content.splitlines(): + process_line(line) + return deps + + +def pip_download_metadata(package: str, version: Optional[str] = None, include_extras: bool = False) -> List[str]: + """Use pip download --no-deps and parse artifacts for Requires-Dist as fallback.""" + tmpdir = tempfile.mkdtemp(prefix="pypi_dl_") + try: + pkg_spec = f"{package}=={version}" if version else package + cmd = [sys.executable, "-m", "pip", "download", "--no-deps", "--dest", tmpdir, pkg_spec] + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120) + if proc.returncode != 0: + return [] + files = os.listdir(tmpdir) + deps: List[str] = [] + for fname in files: + path = os.path.join(tmpdir, fname) + if fname.endswith(".whl"): + deps.extend(_extract_metadata_requires_from_wheel(path, include_extras=include_extras)) + elif fname.endswith((".tar.gz", ".zip", ".tar.bz2", ".tar")): + deps.extend(_extract_metadata_requires_from_sdist(path, include_extras=include_extras)) + + # dedupe preserving order + seen: Set[str] = set() + out: List[str] = [] + for dep_spec in deps: + # Extract package name for deduplication (before first space) + dep_name = dep_spec.split()[0] + if dep_name not in seen: + seen.add(dep_name) + out.append(dep_spec) + return out + except Exception: + return [] + finally: + shutil.rmtree(tmpdir, ignore_errors=True) + + +def _fetch_raw(url: str, timeout: int = 10) -> Optional[str]: + try: + r = requests.get(url, timeout=timeout) + if r.status_code == 200: + return r.text + except Exception: + pass + return None + + +def repo_inspect_from_homepage(home_url: str, include_extras: bool = False) -> List[str]: + """Best-effort: if homepage is a GitHub repo, fetch common files to extract deps.""" + if not home_url: + return [] + m = re.search(r"github\.com[:/]+([^/]+)/([^/]+)(?:/|$)", home_url) + 
if not m: + return [] + owner, repo = m.group(1), m.group(2).rstrip(".git") + candidates = [ + f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/pyproject.toml", + f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/setup.cfg", + f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/setup.py", + f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/requirements.txt", + ] + results: List[str] = [] + seen: Set[str] = set() + env = default_environment() if default_environment else None + + for url in candidates: + txt = _fetch_raw(url) + if not txt: + continue + if url.endswith("pyproject.toml") and toml: + try: + data = toml.loads(txt) + deps: List[str] = [] + proj = data.get("project") if isinstance(data, dict) else None + if not proj: + # poetry support + tool = data.get("tool", {}) + poetry = tool.get("poetry") if isinstance(tool, dict) else None + proj = poetry + if proj: + if isinstance(proj.get("dependencies"), dict): + for k, v in proj["dependencies"].items(): + if k not in seen: + seen.add(k) + version_spec = str(v) if v != "python" and v else "Any" + dependency_spec = f"{k} {version_spec if version_spec != k else 'Any'}" + results.append(dependency_spec) + elif isinstance(proj.get("dependencies"), list): + for entry in proj["dependencies"]: + parsed = _parse_requires_dist(entry) if isinstance(entry, str) else None + if not parsed: + continue + name, version_spec, extras, marker = parsed + if extras and not include_extras: + continue + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + continue + if marker and env is not None: + try: + if not marker.evaluate(env): + continue + except Exception: + continue + if name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + results.append(dependency_spec) + continue + except Exception: + pass + if url.endswith("setup.cfg"): + try: + import configparser + + cfg = configparser.ConfigParser() + cfg.read_string(txt) + if cfg.has_section("options") and cfg.has_option("options", "install_requires"): + raw = cfg.get("options", "install_requires") + for line in raw.splitlines(): + line = line.strip() + if not line: + continue + # attempt parse using packaging if available + parsed = _parse_requires_dist(line) + if not parsed: + continue + name, version_spec, extras, marker = parsed + if extras and not include_extras: + continue + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + continue + if marker and env is not None: + try: + if not marker.evaluate(env): + continue + except Exception: + continue + if name and name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + results.append(dependency_spec) + continue + except Exception: + pass + if url.endswith("requirements.txt"): + lines = txt.splitlines() + for line in lines: + line = line.strip() + if not line or line.startswith("#"): + continue + # Parse using _parse_requires_dist for version info + parsed = _parse_requires_dist(line) + if parsed: + name, version_spec, extras, marker = parsed + if name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + results.append(dependency_spec) + else: + # fallback: strip extras and specifiers for name only + if "[" in line: + line = line.split("[", 1)[0] + original_line = line + for sep in ("==", ">=", "<=", "~=", ">", "<", "!="): + if sep in line: + line = line.split(sep, 1)[0].strip() + break + mname = NAME_RE.match(line) + if mname: + n = mname.group(1) + 
if n not in seen: + seen.add(n) + # Try to extract version from original line + version_spec = "Any" + for sep in ("==", ">=", "<=", "~=", ">", "<", "!="): + if sep in original_line: + version_spec = original_line.split(sep, 1)[1].strip() + version_spec = sep + version_spec + break + dependency_spec = f"{n} {version_spec}" + results.append(dependency_spec) + continue + if url.endswith("setup.py"): + # naive parse: look for install_requires = [ ... ] + mlist = re.search(r"install_requires\s*=\s*\[([^\]]+)\]", txt, re.S) + if mlist: + block = mlist.group(1) + parts = re.findall(r"['\"]([^'\"]+)['\"]", block) + for p in parts: + parsed = _parse_requires_dist(p) + if not parsed: + continue + name, version_spec, extras, marker = parsed + if extras and not include_extras: + continue + marker_str = str(marker) if marker is not None else "" + if "extra" in marker_str and not include_extras: + continue + if marker and env is not None: + try: + if not marker.evaluate(env): + continue + except Exception: + continue + if name and name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + results.append(dependency_spec) + # attempt referenced requirements + mreq = re.search(r"requirements(?:.*?)\.txt", txt) + if mreq: + req_url = f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/requirements.txt" + txt2 = _fetch_raw(req_url) + if txt2: + for line in txt2.splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + parsed = _parse_requires_dist(line) + if parsed: + name, version_spec, extras, marker = parsed + if name not in seen: + seen.add(name) + dependency_spec = f"{name} {version_spec}" + results.append(dependency_spec) + else: + # fallback + if "[" in line: + line = line.split("[", 1)[0] + original_line = line + for sep in ("==", ">=", "<=", "~=", ">", "<", "!="): + if sep in line: + line = line.split(sep, 1)[0].strip() + break + mname = NAME_RE.match(line) + if mname: + n = mname.group(1) + if n not in seen: + seen.add(n) + version_spec = "Any" + for sep in ("==", ">=", "<=", "~=", ">", "<", "!="): + if sep in original_line: + version_spec = original_line.split(sep, 1)[1].strip() + version_spec = sep + version_spec + break + dependency_spec = f"{n} {version_spec}" + results.append(dependency_spec) + return results + + +def find_dependency_for_pip_package(package: str, version: Optional[str] = None, include_extras: bool = False, include_repo: bool = True) -> List[str]: + final: List[str] = [] + seen: Set[str] = set() + + # 定义常见的非PyPI包(标准库和系统包) + non_pypi_packages = { + # Python 标准库模块 + 'sys', 'os', 'json', 're', 'math', 'time', 'datetime', 'collections', + 'itertools', 'functools', 'operator', 'copy', 'pickle', 'sqlite3', + 'threading', 'multiprocessing', 'subprocess', 'socket', 'urllib', + 'http', 'email', 'xml', 'html', 'hashlib', 'base64', 'uuid', + 'logging', 'warnings', 'traceback', 'inspect', 'types', 'typing', + 'pathlib', 'glob', 'shutil', 'tempfile', 'io', 'argparse', + 'configparser', 'csv', 'gzip', 'zipfile', 'tarfile', 'zlib', + 'unittest', 'doctest', 'pdb', 'profile', 'cProfile', 'timeit', + 'gc', 'weakref', 'ctypes', 'struct', 'array', 'bisect', 'heapq', + 'random', 'statistics', 'decimal', 'fractions', 'cmath', + + # 系统和特殊包 + 'python', 'python3', 'pip', 'setuptools', 'wheel', 'distutils', + 'pkg_resources', 'site', 'sysconfig', + + # 常见的虚拟包或元包 + 'win32', 'win32api', 'win32com', 'winerror', 'msvcrt', + 'posix', 'nt', 'pwd', 'grp', 'termios', 'tty', 'pty', + + # 其他常见的非安装包 + 'builtins', '__builtin__', '__future__', '__main__', + } 
+ + # 1. PyPI metadata (filtered) + pypi_list = requires_dist_filtered(package, version, include_extras=include_extras) + for dep in pypi_list: + # dep is now a string "name version_spec" + name = dep.split()[0].lower() + if name not in seen: + seen.add(name) + final.append(dep) + + # 2. pip download metadata (fallback) + pip_list = pip_download_metadata(package, version, include_extras=include_extras) + for dep in pip_list: + # dep is now a string "name version_spec" + name = dep.split()[0].lower() + if name not in seen: + seen.add(name) + final.append(dep) + + # 3. repository inspection (use project_urls/home_page from PyPI) + if include_repo: + meta = fetch_pypi_json(package, version) + home = None + if meta: + info = meta.get("info", {}) or {} + project_urls = info.get("project_urls") or {} + if isinstance(project_urls, dict): + home = project_urls.get("Source") or project_urls.get("Homepage") or info.get("home_page") + else: + home = info.get("home_page") + if home: + repolist = repo_inspect_from_homepage(home, include_extras=include_extras) + for dep in repolist: + # dep is now a string "name version_spec" + name = dep.split()[0].lower() + if name not in seen: + seen.add(name) + final.append(dep) + + # 过滤结果:删除自身和非PyPI包 + filtered_final = [] + package_name_lower = package.lower() + + for dep in final: + dep_name = dep.split()[0].lower() + + # 跳过自身 + if dep_name == package_name_lower: + continue + + # 跳过常见的非PyPI包 + if dep_name in non_pypi_packages: + continue + + # 跳过包名中包含常见非PyPI特征的包 + if (dep_name.startswith('__') and dep_name.endswith('__')) or \ + dep_name in ['python2', 'python3'] or \ + dep_name.startswith('python-') and dep_name.endswith('-dev'): + continue + + filtered_final.append(dep) + + return filtered_final + + +GPU_KEYWORDS = [ + "gpu", + "cuda", + "nvidia", + "cudnn", + "cublas", + "rocm", + "tensorrt", + "cupy", + "gpu-accelerated", + "cuda-toolkit", +] + +README_CANDIDATES = [ + 'README.md', 'readme.md', 'README.rst', + 'readme.rst', 'README.txt', 'README', + 'README.MD' +] + +def detect_gpu_requirement(package: str, version: Optional[str] = None, include_repo: bool = True) -> Dict: + """Return a dict with GPU detection: {'gpu': bool, 'matches': [keywords], 'sources': [which fields matched] }. + + The detector inspects PyPI metadata (summary, description, keywords, classifiers) and, + if requested, attempts to fetch the repository README for additional hints. 
+ """ + matches: Set[str] = set() + sources: Set[str] = set() + meta = fetch_pypi_json(package, version) + + def _scan_text(name: str, text: Optional[str]): + if not text: + return + t = text.lower() + for kw in GPU_KEYWORDS: + if kw in t: + matches.add(kw) + sources.add(name) + + if meta: + info = meta.get("info", {}) or {} + _scan_text("summary", info.get("summary")) + _scan_text("description", info.get("description")) + # keywords may be a space/comma separated string + kws = info.get("keywords") + if isinstance(kws, str): + _scan_text("keywords", kws) + # classifiers + classifiers = info.get("classifiers") or [] + if classifiers: + _scan_text("classifiers", "\n".join(classifiers)) + + # optional: fetch README from GitHub if homepage points there + if include_repo and meta: + info = meta.get("info", {}) or {} + project_urls = info.get("project_urls") or {} + home = None + if isinstance(project_urls, dict): + home = project_urls.get("Source") or project_urls.get("Homepage") or info.get("home_page") + else: + home = info.get("home_page") + if home and re.search(r"github\.com[:/]+([^/]+)/([^/]+)(?:/|$)", home): + # try common README locations + m = re.search(r"github\.com[:/]+([^/]+)/([^/]+)(?:/|$)", home) + owner, repo = m.group(1), m.group(2).rstrip(".git") + for readme_name in README_CANDIDATES: + url = f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/{readme_name}" + txt = _fetch_raw(url) + if txt: + _scan_text(f"readme:{readme_name}", txt) + # stop early if we already found matches + if matches: + break + + return {"gpu": bool(matches), "matches": sorted(matches), "sources": sorted(sources)} + + +def detect_gpu_level(package: str, version: Optional[str] = None, include_repo: bool = True) -> Dict: + """Classify GPU requirement into levels: 'required', 'optional', 'unknown', or 'none'. + + Heuristics: + - 'required' if requires_dist lists a GPU-related package (name matches GPU keywords), + or description/classifiers contain strong 'requires' phrasing near GPU keywords. + - 'optional' if requires_dist contains extras referencing GPU or README/description mentions + 'optional' near GPU keywords. + - 'unknown' if GPU keywords exist in metadata/README but no clear required/optional evidence. + - 'none' if no GPU keywords found. 
+ """ + meta = fetch_pypi_json(package, version) + found_keywords = set() + required_evidence = False + optional_evidence = False + + # helper for phrase checks + def _has_require_phrase(text: str) -> bool: + return bool(re.search(r"\b(require|requires|required|need|needs)\b.{0,40}\b(?:" + "|".join(GPU_KEYWORDS) + r")\b", text, flags=re.I)) + + def _has_optional_phrase(text: str) -> bool: + return bool(re.search(r"\b(optional|optionally|support for|supports)\b.{0,40}\b(?:" + "|".join(GPU_KEYWORDS) + r")\b", text, flags=re.I)) + + if meta: + info = meta.get("info", {}) or {} + # scan requires_dist entries + requires = info.get("requires_dist") or [] + for entry in requires: + parsed = _parse_requires_dist(entry) + if not parsed: + continue + name, version_spec, extras, marker = parsed + lname = (name or "").lower() + # direct dependency on a gpu-related package -> required + for kw in GPU_KEYWORDS: + if kw in lname: + required_evidence = True + found_keywords.add(kw) + # extras mentioning gpu -> optional + for ex in extras: + lex = ex.lower() + for kw in GPU_KEYWORDS: + if kw in lex: + optional_evidence = True + found_keywords.add(kw) + + # scan summary/description/classifiers/keywords + summary = (info.get("summary") or "") + desc = (info.get("description") or "") + classifiers = "\n".join(info.get("classifiers") or []) + kws = info.get("keywords") or "" + for txt, src in ((summary, "summary"), (desc, "description"), (classifiers, "classifiers"), (kws, "keywords")): + if not txt: + continue + tl = txt.lower() + for kw in GPU_KEYWORDS: + if kw in tl: + found_keywords.add(kw) + if _has_require_phrase(txt): + required_evidence = True + if _has_optional_phrase(txt): + optional_evidence = True + + # optional README scan via repo if requested + if include_repo and meta: + info = meta.get("info", {}) or {} + project_urls = info.get("project_urls") or {} + home = None + if isinstance(project_urls, dict): + home = project_urls.get("Source") or project_urls.get("Homepage") or info.get("home_page") + else: + home = info.get("home_page") + if home and re.search(r"github\.com[:/]+([^/]+)/([^/]+)(?:/|$)", home): + m = re.search(r"github\.com[:/]+([^/]+)/([^/]+)(?:/|$)", home) + owner, repo = m.group(1), m.group(2).rstrip(".git") + for readme_name in README_CANDIDATES: + url = f"https://raw.githubusercontent.com/{owner}/{repo}/HEAD/{readme_name}" + txt = _fetch_raw(url) + if not txt: + continue + t = txt.lower() + for kw in GPU_KEYWORDS: + if kw in t: + found_keywords.add(kw) + if _has_require_phrase(txt): + required_evidence = True + if _has_optional_phrase(txt): + optional_evidence = True + # stop early if we found required evidence + if required_evidence: + break + + # decide level + if required_evidence: + level = "required" + elif optional_evidence: + level = "optional" + elif found_keywords: + level = "unknown" + else: + level = "none" + + return {"level": level, "keywords": sorted(found_keywords), "required_evidence": required_evidence, "optional_evidence": optional_evidence} + + +def main() -> None: + parser = argparse.ArgumentParser(description="Resolve runtime dependency package names for a PyPI package.") + parser.add_argument("package", help="PyPI package name") + parser.add_argument("--version", help="Specific version (optional)", default=None) + parser.add_argument("--include-extras", action="store_true", help="Include extras optional dependencies") + parser.add_argument("--no-repo", action="store_true", help="Do not attempt repository inspection") + args = parser.parse_args() + + deps = 
+    deps = find_dependency_for_pip_package(args.package, args.version, include_extras=args.include_extras, include_repo=not args.no_repo)
+    gpu_info = detect_gpu_requirement(args.package, args.version, include_repo=not args.no_repo)
+    gpu_level = detect_gpu_level(args.package, args.version, include_repo=not args.no_repo)
+    # combine into a single gpu object
+    gpu_obj = {**gpu_info, "classification": gpu_level}
+    out = {"package": args.package, "version": args.version, "dependencies": deps, "gpu": gpu_obj}
+    print(json.dumps(out, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_pypi.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_pypi.py"
new file mode 100644
index 0000000000000000000000000000000000000000..94f5aeceee6f0c33fe3efa7d3235845329a1fcc7
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/analyse_pypi.py"
@@ -0,0 +1,172 @@
+import argparse
+import json
+from pathlib import Path
+from typing import List, Dict, Optional, Tuple
+import requests
+import sys
+import re
+
+root_dir = Path(__file__).parent.parent
+sys.path.insert(0, str(root_dir))
+
+from info_crawler.github_tools import list_tree, fetch_file
+
+PYPI_API_TEMPLATE = "https://pypi.org/pypi/{name}/json"
+
+def _fetch_pypi_json(package: str, version: Optional[str] = None, timeout: int = 10):
+    url = PYPI_API_TEMPLATE.format(name=package) if version is None else PYPI_API_TEMPLATE.format(name=f"{package}/{version}")
+    return requests.get(url, timeout=timeout)
+
+def check_pypi(candidate_package_name: str, version: Optional[str] = None) -> Tuple[bool, Optional[Dict]]:
+    """Check a single candidate package name on PyPI. If it exists, return (True, package info), else (False, None)."""
+    r = _fetch_pypi_json(candidate_package_name, version)
+    result = r.status_code == 200
+    return result, r.json() if result else None
+
+def check_pypi_list(candidate_package_names: List[str], version: Optional[str] = None) -> Dict[str, bool]:
+    """Check candidate package names on PyPI. Returns a map name -> exists."""
+    result = {}
+    for name in candidate_package_names:
+        result[name], _ = check_pypi(name, version)
+    return result
+
+README_CANDIDATES = [
+    'README.md', 'readme.md', 'README.rst',
+    'readme.rst', 'README.txt', 'README',
+    'README.MD'
+]
+
+# Simplified pip package-name pattern - extracts standard PyPI names only
+PIP_PATTERN = re.compile(
+    r'pip\s+install\s+'  # base command
+    r'(?:--?\w+(?:\s+[^\s-]+)?\s+)*'  # skip long options
+    r'(?:-[a-zA-Z]\s+)*'  # skip short options
+    r"(?:['\"])?([a-zA-Z0-9][a-zA-Z0-9\-_\.]*(?:\[[^\]]+\])?)(?:['\"])?"
# 捕获包名(支持引号和方括号) + r'(?:[><=!~]=?[\d\w\.\-\+]+)*', # 可选版本约束 + re.IGNORECASE +) + +def _parse_pip_commands(text: str) -> list: + """解析pip安装命令,提取标准PyPI包名""" + packages = [] + + # 按行处理,跳过包含特殊安装方式的行 + lines = text.split('\n') + for line in lines: + # 跳过包含这些模式的行:requirements文件、本地安装、git仓库 + if any(pattern in line.lower() for pattern in ['-r ', 'git+', 'install .', 'install -e']): + continue + + # 匹配标准包名 + for match in PIP_PATTERN.finditer(line): + package = match.group(1).strip() + + # 验证包名有效性 + if (package and len(package) > 1 and + not package.endswith('.txt') and + not package.startswith('.') and + not package.startswith('http') and # 排除URL + not package.lower() == 'pip' and # 排除--upgrade pip + not package.lower() == 'poetry' and # 排除安装poetry + not package.lower() == 'uv'): # 排除安装uv + packages.append(package) + + # 去重并保持顺序 + unique_packages = [] + seen = set() + for pkg in packages: + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg not in seen: + unique_packages.append(pkg) + seen.add(base_pkg) + return unique_packages + +def _read_project_name_from_readme(url: str, file_list: List[str]) -> Optional[str]: + for readme_name in README_CANDIDATES: + # find any path in file_list that contains the filename as a substring + match = next((f for f in file_list if readme_name in f), None) + if match: + content = fetch_file(url, match) + if content: + package_list = _parse_pip_commands(content) + if package_list: + return package_list[0] + return None + +def _read_project_name_from_pyproject(url: str, file_list: List[str]) -> Optional[str]: + for p in ['pyproject.toml']: + # find any path in file_list that contains the filename as a substring + match = next((f for f in file_list if p in f), None) + if match: + content = fetch_file(url, match) + if content: + m = re.search(r'name\s*=\s*"([^"]+)"', content) + if m: + return m.group(1) + return None + +def _read_project_name_from_setup_cfg(url: str, file_list: List[str]) -> Optional[str]: + match = next((f for f in file_list if 'setup.cfg' in f), None) + if match: + content = fetch_file(url, match) + if content: + m = re.search(r'^name\s*=\s*(.+)$', content, re.MULTILINE) + if m: + return m.group(1).strip() + return None + +def _read_project_name_from_setup_py(url: str, file_list: List[str]) -> Optional[str]: + match = next((f for f in file_list if 'setup.py' in f), None) + if match: + content = fetch_file(url, match) + if content: + m = re.search(r'name\s*=\s*["\']([^"\']+)["\']', content) + if m: + return m.group(1) + return None + +def detect_candidate_package_names(url: str, repo: str, file_list: List[str]) -> List[str]: + """Try to determine likely PyPI package name(s).""" + names = [] + # explicit files + for fn in (_read_project_name_from_pyproject, _read_project_name_from_setup_cfg, _read_project_name_from_setup_py, _read_project_name_from_readme): + try: + name = fn(url, file_list) + if name: + names.append(name) + except Exception: + continue + # fallback to repo name + if repo and repo not in names: + names.append(repo) + # also consider normalized names (replace _ with -) + for n in list(names): + alt = n.replace('_', '-') + if alt not in names: + names.append(alt) + print(f"Detected candidate package names: {names}") + return names + +def main(): + """Main function to analyze a GitHub repository.""" + repo_url = input("Enter GitHub repository URL: ").strip() + branch = input("Enter branch name (default: HEAD): ").strip() or "HEAD" + + # Fetch file list from the repository + file_list = list_tree(repo_url, branch) + if not 
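+    # A quick illustration of what _parse_pip_commands extracts (hypothetical
+    # README snippet; the -r and git+ lines are skipped by the line filter above):
+    #
+    #     text = '''
+    #     pip install ultralytics
+    #     pip install -r requirements.txt
+    #     pip install git+https://github.com/owner/repo.git
+    #     pip install "onnxruntime[gpu]"
+    #     '''
+    #     _parse_pip_commands(text)  # -> ['ultralytics', 'onnxruntime[gpu]']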
file_list:
+        print("No files found in the repository.")
+        return
+
+    # Detect candidate package names
+    repo_name = repo_url.split('/')[-1]  # Extract repo name from URL
+    candidate_package_names = detect_candidate_package_names(repo_url, repo_name, file_list)
+
+    # Check PyPI for these names
+    pypi_check = check_pypi_list(candidate_package_names)
+
+    print("Candidate package names:", candidate_package_names)
+    print("PyPI check results:", pypi_check)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_abstracts_from_github.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_abstracts_from_github.py"
new file mode 100644
index 0000000000000000000000000000000000000000..f0ccd623e9cc4a5b84e2578fe416f16e2b3c93e2
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_abstracts_from_github.py"
@@ -0,0 +1,74 @@
+import argparse
+from pathlib import Path
+import requests
+import csv
+import json
+
+# -------------------- Configuration --------------------
+root_dir = Path(__file__).parent.parent
+CONFIG_FILE = f"{root_dir}/config.json"
+API_BASE = "https://api.github.com"
+TOKEN = json.load(open(CONFIG_FILE, 'r'))['github_access_token']
+HEADERS = {
+    "Authorization": f"Bearer {TOKEN}",
+    "Accept": "application/vnd.github+json"
+}
+# --------------------------------------------------------
+
+# CSV configuration (includes a description field)
+CSV_FILENAME = f"{root_dir}/tmp/github_repos_with_desc.csv"
+CSV_HEADERS = ["name", "url", "language", "stars", "description", "updated_at"]
+
+def fetch_repositories(topic: str = "ai"):
+    """Fetch only the first 1000 results."""
+    # Query parameters (filter by topic + star count)
+    query_params = {
+        "q": f"topic:{topic} stars:>=1000",
+        "sort": "stars",
+        "order": "desc",
+        "per_page": 100  # maximum results per page
+    }
+    max_pages = 10  # 1000/100 = 10 pages
+    repos = []
+    for page in range(1, max_pages + 1):
+        print("抓取第 {} 页...".format(page))
+        url = f"{API_BASE}/search/repositories?page={page}"
+        response = requests.get(url, headers=HEADERS, params=query_params)
+        if response.status_code != 200:
+            print(f"请求失败: {response.status_code}")
+            break
+        data = response.json()
+        repos.extend(data["items"])
+        # stop early when fewer than a full page is returned (fewer than 1000 results exist)
+        if len(data["items"]) < query_params["per_page"]:
+            break
+    return repos
+
+def save_to_csv(repos, result_file_name):
+    """Write repositories to a CSV file, handling empty descriptions."""
+    with open(result_file_name, "w", newline="", encoding="utf-8") as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=CSV_HEADERS)
+        writer.writeheader()
+        for repo in repos:
+            # fall back to a placeholder when description is empty
+            description = repo["description"].strip() if repo["description"] else "未填写"
+            writer.writerow({
+                "name":
repo["name"], + "url": repo["html_url"], + "language": repo["language"] if repo["language"] else "未标注", + "stars": repo["stargazers_count"], + "description": description, + "updated_at": repo["updated_at"] + }) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='抓取GitHub指定Topic的热门仓库') + parser.add_argument('--topic', type=str, default='ai', help='要抓取的GitHub话题') + args = parser.parse_args() + repositories = fetch_repositories(args.topic) + with open(CONFIG_FILE, 'r') as f: + config = json.load(f) + tmp_path = config.get('tmp_path', f"{root_dir}/tmp") + output = f'{tmp_path}/github_{args.topic}_repos_with_desc.csv' + save_to_csv(repositories, output) + print(f"成功导出 {len(repositories)} 条数据至 {output}") \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_pypi_name.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_pypi_name.py" new file mode 100644 index 0000000000000000000000000000000000000000..2bf693bf790b329ba97166e077ad3c17c185abc3 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/get_pypi_name.py" @@ -0,0 +1,664 @@ +import argparse +import re +import requests +import base64 +import json +import time +import sqlite3 +import os +import sys +from urllib.parse import urlparse +from typing import Tuple, Optional +from datetime import datetime +from pathlib import Path + +root_dir = Path(__file__).parent.parent +sys.path.insert(0, str(root_dir)) +from ai_agent.llm_api import call_llm_api + +class ReadmeAnalyzer: + """GitHub仓库README分析工具""" + + README_CANDIDATES = [ + 'README.md', 'readme.md', 'README.rst', + 'readme.rst', 'README.txt', 'README', + 'README.MD' + ] + + # 简化的pip包名匹配模式 - 只提取标准PyPI包名 + PIP_PATTERN = re.compile( + r'pip\s+install\s+' # 基础命令 + r'(?:--?\w+(?:\s+[^\s-]+)?\s+)*' # 跳过各种参数 + r'(?:-[a-zA-Z]\s+)*' # 跳过短参数 + r"(?:['\"])?([a-zA-Z0-9][a-zA-Z0-9\-_\.]*(?:\[[^\]]+\])?)(?:['\"])?" 
# 捕获包名(支持引号和方括号) + r'(?:[><=!~]=?[\d\w\.\-\+]+)*', # 可选版本约束 + re.IGNORECASE + ) + + # 添加频率控制相关的类变量 + _last_llm_call_time = 0 + _qpm_interval = 60.0 # 默认每分钟1次请求的间隔 + + @classmethod + def extract_repo_info(cls, url: str) -> Tuple[Optional[str], Optional[str]]: + """解析GitHub仓库信息(基于网页8的URL处理逻辑)""" + parsed = urlparse(url) + path_segments = parsed.path.strip('/').split('/') + return (path_segments[0], path_segments[1]) if len(path_segments)>=2 else (None, None) + + @classmethod + def fetch_readme(cls, owner: str, repo: str, token: str = None) -> Optional[str]: + """获取README内容(支持多格式,参考网页3的.rst处理)""" + headers = {'Authorization': f'token {token}'} if token else {} + + for readme_name in cls.README_CANDIDATES: + try: + response = requests.get( + f"https://api.github.com/repos/{owner}/{repo}/contents/{readme_name}", + headers=headers, + timeout=10 + ) + if response.status_code == 200: + print(f"{owner}/{repo} 仓库的readme是{readme_name}") + return base64.b64decode(response.json()['content']).decode('utf-8') + except Exception as e: + print(f"Error fetching {readme_name}: {str(e)}") + + return None + + @classmethod + def parse_pip_commands(cls, text: str) -> list: + """解析pip安装命令,提取标准PyPI包名""" + packages = [] + + # 按行处理,跳过包含特殊安装方式的行 + lines = text.split('\n') + for line in lines: + # 跳过包含这些模式的行:requirements文件、本地安装、git仓库 + if any(pattern in line.lower() for pattern in ['-r ', 'git+', 'install .', 'install -e']): + continue + + # 匹配标准包名 + for match in cls.PIP_PATTERN.finditer(line): + package = match.group(1).strip() + + # 验证包名有效性 + if (package and len(package) > 1 and + not package.endswith('.txt') and + not package.startswith('.') and + not package.startswith('http') and # 排除URL + not package.lower() == 'pip' and # 排除--upgrade pip + not package.lower() == 'poetry' and # 排除安装poetry + not package.lower() == 'uv'): # 排除安装uv + packages.append(package) + + # 去重并保持顺序 + unique_packages = [] + seen = set() + for pkg in packages: + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg not in seen: + unique_packages.append(pkg) + seen.add(base_pkg) + + print(f"找到 {len(unique_packages)} 个PyPI包: {unique_packages}") + return unique_packages + + @classmethod + def analyze_with_llm(cls, readme_content: str, repo_name: str, qpm: int = 1) -> list: + """使用LLM分析README中的pip安装目标(单个仓库)""" + result = cls.analyze_batch_with_llm([{ + 'repo_name': repo_name, + 'readme_content': readme_content + }], qpm) + return result.get(repo_name, []) + + @classmethod + def analyze_batch_with_llm(cls, repo_data_list: list, qpm: int = 1) -> dict: + """使用LLM批量分析多个README中的pip安装目标 + + Args: + repo_data_list: 列表,每个元素是 {'repo_name': str, 'readme_content': str} + qpm: 每分钟请求次数限制 + + Returns: + dict: {repo_name: [packages_list], ...} + """ + # 频率控制:根据QPM限制请求频率 + interval = 60.0 / qpm # 计算每次请求的间隔(秒) + + # 如果不是第一次调用,需要等待 + if cls._last_llm_call_time > 0: + current_time = time.time() + time_since_last_call = current_time - cls._last_llm_call_time + if time_since_last_call < interval: + sleep_time = interval - time_since_last_call + print(f"QPM限制:等待 {sleep_time:.2f} 秒...") + time.sleep(sleep_time) + + # 构建批量分析的prompt + repos_section = "" + for i, repo_data in enumerate(repo_data_list, 1): + repos_section += f""" +=== 仓库 {i}: {repo_data['repo_name']} === +{repo_data['readme_content']} + +""" + + prompt = f""" +请分析以下多个GitHub仓库的README文档,为每个仓库分别提取所有pip install命令中的标准PyPI包名。 + +{repos_section} + +请按以下要求分析: +1. 找出每个仓库README中的所有pip install命令 +2. 只提取标准PyPI包名(忽略requirements文件、本地安装、git仓库等) +3. 保留包名和可选的extras(如package[extra]) +4. 
忽略版本约束,只要包名 + +请以JSON格式返回结果,格式如下: +{{ + "仓库名1": ["package1", "package2[extras]"], + "仓库名2": ["package3", "package4"], + ... +}} + +只返回JSON,不要添加其他说明文字。 +""" + + try: + message = {"role": "user", "content": prompt} + # Build proper chat-style messages list for the LLM API + messages = [message] + # 调用内部的 LLM API; llm_api expects 'messages' to be a list of message dicts + content = call_llm_api(messages, verbose=False) + print(f"LLM批量分析结果: {content}") + + # 尝试解析JSON响应 + try: + # 移除可能的markdown代码块标记 + if content.startswith('```json'): + content = content[7:] + if content.endswith('```'): + content = content[:-3] + + parsed_result = json.loads(content) + print(f"LLM批量分析成功,处理了 {len(parsed_result)} 个仓库") + print(f"处理结果: {parsed_result}") + # 在返回结果后,更新最后调用时间并强制等待完整间隔 + cls._last_llm_call_time = time.time() + print(f"QPM限制:强制等待完整间隔 {interval:.2f} 秒...") + time.sleep(interval) + + return parsed_result + + except json.JSONDecodeError as e: + print(f"解析LLM响应JSON失败: {e}") + print(f"响应内容: {content}") + + # 即使解析失败也要更新时间并等待 + cls._last_llm_call_time = time.time() + print(f"QPM限制:强制等待完整间隔 {interval:.2f} 秒...") + time.sleep(interval) + return {} + + except Exception as e: + print(f"LLM批量分析出错: {str(e)}") + + # 即使出错也要更新时间并等待 + cls._last_llm_call_time = time.time() + print(f"QPM限制:强制等待完整间隔 {interval:.2f} 秒...") + time.sleep(interval) + return {} + +class FileProcessor: + + @staticmethod + def init_database(db_path: str = "repos.db"): + """初始化SQLite数据库""" + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # 创建仓库信息表 + cursor.execute(''' + CREATE TABLE IF NOT EXISTS repositories ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + name TEXT NOT NULL, + url TEXT NOT NULL UNIQUE, + owner TEXT, + repo_name TEXT, + stars INTEGER, + last_updated TEXT, + description TEXT, + language TEXT, + pip_packages TEXT, + license_name TEXT, + size_kb INTEGER, + created_at TEXT, + processed_at TEXT DEFAULT CURRENT_TIMESTAMP, + UNIQUE(owner, repo_name) + ) + ''') + + conn.commit() + conn.close() + print(f"数据库已初始化: {db_path}") + + @staticmethod + def fetch_repo_metadata(owner: str, repo: str, token: str = None) -> dict: + """获取GitHub仓库的元数据信息""" + headers = {'Authorization': f'token {token}'} if token else {} + + try: + response = requests.get( + f"https://api.github.com/repos/{owner}/{repo}", + headers=headers, + timeout=10 + ) + + if response.status_code == 200: + data = response.json() + return { + 'stars': data.get('stargazers_count', 0), + 'last_updated': data.get('updated_at', ''), + 'description': data.get('description', ''), + 'language': data.get('language', ''), + 'license_name': data.get('license', {}).get('name', '') if data.get('license') else '', + 'size_kb': data.get('size', 0), + 'created_at': data.get('created_at', '') + } + else: + print(f"获取仓库元数据失败 {owner}/{repo}: HTTP {response.status_code}") + return {} + + except Exception as e: + print(f"获取仓库元数据出错 {owner}/{repo}: {str(e)}") + return {} + + @staticmethod + def save_to_database(name: str, url: str, owner: str, repo: str, + packages: list, metadata: dict = None, + db_path: str = "repos.db"): + """将仓库信息保存到数据库""" + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + # 准备数据 + packages_str = json.dumps(packages) if packages else '[]' + processed_at = datetime.now().isoformat() + + # 如果没有提供元数据,使用默认值 + if metadata is None: + metadata = {} + + try: + # 使用INSERT OR REPLACE来处理重复数据 + cursor.execute(''' + INSERT OR REPLACE INTO repositories + (name, url, owner, repo_name, stars, last_updated, description, + language, pip_packages, license_name, size_kb, 
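+        # The repositories table written by this method can later be queried
+        # directly, e.g. (illustrative):
+        #
+        #     import sqlite3, json
+        #     conn = sqlite3.connect("repos.db")
+        #     rows = conn.execute(
+        #         "SELECT name, pip_packages FROM repositories WHERE stars >= 1000"
+        #     ).fetchall()
+        #     for name, pkgs in rows:
+        #         print(name, json.loads(pkgs))  # pip_packages is stored as JSON text
+        #     conn.close()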
created_at, processed_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ''', ( + name, + url, + owner, + repo, + metadata.get('stars', 0), + metadata.get('last_updated', ''), + metadata.get('description', ''), + metadata.get('language', ''), + packages_str, + metadata.get('license_name', ''), + metadata.get('size_kb', 0), + metadata.get('created_at', ''), + processed_at + )) + + conn.commit() + print(f"已保存到数据库: {name} ({owner}/{repo})") + + except sqlite3.Error as e: + print(f"数据库保存失败 {name}: {str(e)}") + finally: + conn.close() + + + @staticmethod + def process_io(input_path: str, output_path: str, token: str = None, + use_llm: bool = False, qpm: int = 1, batch_size: int = 5, + save_to_db: bool = False, db_path: str = "repos.db", + cache_days: int = 30): + """执行批处理流程 - 支持LLM分析选项、批量处理、数据库存储和缓存 + + Args: + cache_days: 缓存有效期(天数),默认30天 + """ + + # 如果启用数据库存储,初始化数据库 + if save_to_db: + FileProcessor.init_database(db_path) + with open(input_path, 'r', encoding='utf-8') as infile: + lines = [line.strip() for line in infile if line.strip()] + + # 开始时清空输出文件 + with open(output_path, 'w', encoding='utf-8') as outfile: + pass # 只是为了清空文件 + + i = 0 + while i < len(lines): + if use_llm and i + batch_size <= len(lines): + # 批量处理 + batch_lines = lines[i:i+batch_size] + batch_data = [] + valid_entries = [] + error_results = [] # 收集错误结果 + + # 收集批次中的有效条目 + for line in batch_lines: + parts = re.split(r'\s+', line, maxsplit=1) + if len(parts) < 2: + error_results.append(f"{line} INVALID_FORMAT []\n") + continue + + name, url = parts[0], parts[1] + owner, repo = ReadmeAnalyzer.extract_repo_info(url) + + if not owner or not repo: + error_results.append(f"{name} {url} INVALID_URL []\n") + continue + + # 检查缓存 + is_cached, cached_data = FileProcessor.check_cache_validity(owner, repo, db_path, cache_days) + if is_cached and cached_data: + # 使用缓存数据直接输出 + packages_list = f"[{', '.join(cached_data['pip_packages'])}]" + cached_result = f"{name} {url} {packages_list}\n" + + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(cached_result) + + print(f"使用缓存数据: {name} ({owner}/{repo}) - 处理时间: {cached_data['processed_at']}") + continue + + try: + content = ReadmeAnalyzer.fetch_readme(owner, repo, token) + if not content: + error_results.append(f"{name} {url} README_NOT_FOUND []\n") + # 即使README未找到,也要保存到数据库避免重复处理 + if save_to_db: + owner_info, repo_info = ReadmeAnalyzer.extract_repo_info(url) + if owner_info and repo_info: + metadata = FileProcessor.fetch_repo_metadata(owner_info, repo_info, token) + FileProcessor.save_to_database(name, url, owner_info, repo_info, + [], metadata, db_path) + continue + + batch_data.append({ + 'repo_name': repo, + 'readme_content': content + }) + valid_entries.append((name, url, repo)) + + except Exception as e: + error_results.append(f"{name} {url} ERROR: {str(e)} []\n") + + # 先写入错误结果 + with open(output_path, 'a', encoding='utf-8') as outfile: + for error_result in error_results: + outfile.write(error_result) + + # 如果有有效的条目,批量调用LLM + if batch_data: + print(f"批量LLM分析 {len(batch_data)} 个仓库的README...") + llm_results = ReadmeAnalyzer.analyze_batch_with_llm(batch_data, qpm) + + # 处理LLM结果并写入文件 + batch_results = [] + for j, (name, url, repo) in enumerate(valid_entries): + packages = llm_results.get(repo, []) + + # 后续处理逻辑保持不变 + unique_packages = [] + seen = set() + for pkg in packages: + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg not in seen: + unique_packages.append(pkg) + seen.add(base_pkg) + + # 不区分大小写匹配仓库名,将匹配的包放在第一位 + repo_name_lower = 
repo.lower().replace('-', '_') + matched_packages = [] + other_packages = [] + + for pkg in unique_packages: + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg == repo_name_lower: + matched_packages.append(pkg) + else: + other_packages.append(pkg) + + # 重新排序:匹配的包在前,其他包在后 + reordered_packages = matched_packages + other_packages + + # 格式化包列表 + packages_list = f"[{', '.join(reordered_packages)}]" + + # 收集结果 + batch_results.append(f"{name} {url} {packages_list}\n") + + # 如果启用数据库存储,保存到数据库 + if save_to_db: + # 获取仓库元数据 + owner, repo_name = ReadmeAnalyzer.extract_repo_info(url) + if owner and repo_name: + metadata = FileProcessor.fetch_repo_metadata(owner, repo_name, token) + FileProcessor.save_to_database(name, url, owner, repo_name, + reordered_packages, metadata, db_path) + + # 将batch结果追加到输出文件 + with open(output_path, 'a', encoding='utf-8') as outfile: + for result in batch_results: + outfile.write(result) + + print(f"已完成批次处理,结果已追加到 {output_path}") + + i += batch_size + + else: + # 单个处理(当不使用LLM或剩余条目不足批次大小时) + line = lines[i] + parts = re.split(r'\s+', line, maxsplit=1) + if len(parts) < 2: + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{line} INVALID_FORMAT []\n") + i += 1 + continue + + name, url = parts[0], parts[1] + owner, repo = ReadmeAnalyzer.extract_repo_info(url) + + if not owner or not repo: + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{name} {url} INVALID_URL []\n") + i += 1 + continue + + # 检查缓存 + is_cached, cached_data = FileProcessor.check_cache_validity(owner, repo, db_path, cache_days) + if is_cached and cached_data: + # 使用缓存数据直接输出 + packages_list = f"[{', '.join(cached_data['pip_packages'])}]" + + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{name} {url} {packages_list}\n") + + print(f"使用缓存数据: {name} ({owner}/{repo}) - 处理时间: {cached_data['processed_at']}") + i += 1 + continue + + try: + content = ReadmeAnalyzer.fetch_readme(owner, repo, token) + if not content: + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{name} {url} README_NOT_FOUND []\n") + + # 即使README未找到,也要保存到数据库避免重复处理 + if save_to_db: + metadata = FileProcessor.fetch_repo_metadata(owner, repo, token) + FileProcessor.save_to_database(name, url, owner, repo, + [], metadata, db_path) + i += 1 + continue + + # 选择分析方法 + if use_llm: + print(f"使用LLM分析 {name} 的README...") + packages = ReadmeAnalyzer.analyze_with_llm(content, repo, qpm) + else: + print(f"使用正则表达式分析 {name} 的README...") + packages = ReadmeAnalyzer.parse_pip_commands(content) + + # 去重处理 + unique_packages = [] + seen = set() + for pkg in packages: + # 处理带extras的包名进行更精确的去重 + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg not in seen: + unique_packages.append(pkg) + seen.add(base_pkg) + + # 不区分大小写匹配仓库名,将匹配的包放在第一位 + repo_name_lower = repo.lower().replace('-', '_') + matched_packages = [] + other_packages = [] + + for pkg in unique_packages: + base_pkg = pkg.split('[')[0].lower().replace('-', '_') + if base_pkg == repo_name_lower: + matched_packages.append(pkg) + else: + other_packages.append(pkg) + + # 重新排序:匹配的包在前,其他包在后 + reordered_packages = matched_packages + other_packages + + # 格式化包列表 + packages_list = f"[{', '.join(reordered_packages)}]" + + # 直接追加到输出文件 + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{name} {url} {packages_list}\n") + + # 如果启用数据库存储,保存到数据库 + if save_to_db: + metadata = FileProcessor.fetch_repo_metadata(owner, repo, token) + 
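+                    # The repo-name-first reordering used in both branches above is
+                    # a standalone heuristic; as a self-contained sketch (hypothetical
+                    # helper name, same logic):
+                    #
+                    #     def put_repo_match_first(packages, repo):
+                    #         key = repo.lower().replace('-', '_')
+                    #         matched = [p for p in packages
+                    #                    if p.split('[')[0].lower().replace('-', '_') == key]
+                    #         others = [p for p in packages if p not in matched]
+                    #         return matched + others
+                    #
+                    #     put_repo_match_first(['numpy', 'opencv-python'], 'opencv-python')
+                    #     # -> ['opencv-python', 'numpy']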
FileProcessor.save_to_database(name, url, owner, repo, + reordered_packages, metadata, db_path) + + except Exception as e: + with open(output_path, 'a', encoding='utf-8') as outfile: + outfile.write(f"{name} {url} ERROR: {str(e)} []\n") + + i += 1 + + + @staticmethod + def check_cache_validity(owner: str, repo: str, db_path: str = "repos.db", cache_days: int = 30) -> tuple: + """检查数据库中是否存在有效的缓存数据 + + Args: + owner: 仓库所有者 + repo: 仓库名 + db_path: 数据库路径 + cache_days: 缓存有效期(天数) + + Returns: + tuple: (is_cached, cached_data) + - is_cached: bool, 是否存在有效缓存 + - cached_data: dict, 缓存的数据(如果存在) + """ + if not os.path.exists(db_path): + return False, None + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + try: + cursor.execute(''' + SELECT name, url, owner, repo_name, stars, last_updated, + description, language, pip_packages, license_name, + size_kb, created_at, processed_at + FROM repositories + WHERE owner = ? AND repo_name = ? + ''', (owner, repo)) + + row = cursor.fetchone() + if not row: + return False, None + + # 检查处理时间是否超过缓存期限 + processed_at = row[12] # processed_at字段 + if processed_at: + try: + processed_time = datetime.fromisoformat(processed_at) + current_time = datetime.now() + time_diff = current_time - processed_time + + if time_diff.days <= cache_days: + # 缓存仍然有效,返回缓存数据 + cached_data = { + 'name': row[0], + 'url': row[1], + 'owner': row[2], + 'repo_name': row[3], + 'stars': row[4], + 'last_updated': row[5], + 'description': row[6], + 'language': row[7], + 'pip_packages': json.loads(row[8]) if row[8] else [], + 'license_name': row[9], + 'size_kb': row[10], + 'created_at': row[11], + 'processed_at': row[12] + } + return True, cached_data + except ValueError: + # 如果时间格式解析失败,视为无效缓存 + pass + + return False, None + + except sqlite3.Error as e: + print(f"数据库查询缓存失败: {str(e)}") + return False, None + finally: + conn.close() + +# 使用示例 +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GitHub仓库获取PyPI包名工具") + parser.add_argument('--input', type=str, default='repos.txt', help='输入文件路径,包含仓库列表') + parser.add_argument('--output', type=str, default='results.txt', help='输出文件路径') + + args = parser.parse_args() + + CONFIG_FILE = f"{root_dir}/config.json" + GITHUB_TOKEN = json.load(open(CONFIG_FILE, 'r'))['github_access_token'] + + # 可以选择使用LLM分析 + USE_LLM = False # 设置为True以启用LLM分析 + QPM = 1 # 每分钟请求次数限制 + BATCH_SIZE = 5 # 批处理大小,默认每次处理5个仓库 + + # 数据库存储选项 + SAVE_TO_DB = False # 设置为True以启用数据库存储 + DB_PATH = "repos.db" # 数据库文件路径 + CACHE_DAYS = 30 # 缓存有效期(天数) + + FileProcessor.process_io(args.input, args.output, GITHUB_TOKEN, + use_llm=USE_LLM, qpm=QPM, batch_size=BATCH_SIZE, + save_to_db=SAVE_TO_DB, db_path=DB_PATH, + cache_days=CACHE_DAYS) + print(f"处理完成!结果已保存至 {args.output}。") \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/github_tools.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/github_tools.py" new file mode 100644 index 
0000000000000000000000000000000000000000..21161fc2c2310776041e1c3f37054c12d7751f35
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/info_crawler/github_tools.py"
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+GitHub Repo Spider
+Usage:
+    # public repository
+    python github_spider.py https://github.com/owner/repo
+
+    # private repository (token required)
+    GITHUB_TOKEN=ghp_xxx python github_spider.py https://github.com/owner/private-repo
+
+    # fetch the file tree only
+    python github_spider.py https://github.com/owner/repo --tree-only
+
+    # fetch specific files only
+    python github_spider.py https://github.com/owner/repo --target README.md requirements.txt
+"""
+import sys
+import json
+import base64
+import argparse
+import requests
+from typing import List, Dict, Any
+from pathlib import Path
+
+# -------------------- Configuration --------------------
+root_dir = Path(__file__).parent.parent
+CONFIG_FILE = f"{root_dir}/config.json"
+API_BASE = "https://api.github.com"
+TOKEN = json.load(open(CONFIG_FILE, 'r'))['github_access_token']
+HEADERS = {
+    "Authorization": f"Bearer {TOKEN}",
+    "Accept": "application/vnd.github+json"
+}
+# --------------------------------------------------------
+
+sys.path.insert(0, str(root_dir))
+from logger.logger import get_logger
+
+logger = get_logger("GitHubTools")
+
+def api_get(url: str) -> Dict[str, Any]:
+    """Generic GET wrapper that follows Link-header pagination automatically."""
+    items = []
+    while url:
+        r = requests.get(url, headers=HEADERS, timeout=30)
+        if r.status_code != 200:
+            logger.error(f"{r.status_code} {r.text}")
+            raise Exception(f"GitHub API 请求失败: {r.status_code}")
+        payload = r.json()
+        # a list response may be paginated
+        if isinstance(payload, list):
+            items.extend(payload)
+            link = r.headers.get("Link", "")
+            next_url = None
+            for part in link.split(","):
+                if 'rel="next"' in part:
+                    next_url = part[part.find("<") + 1: part.find(">")]
+            url = next_url
+        else:
+            return payload
+    return items
+
+
+def parse_repo_url(url: str) -> tuple[str, str]:
+    """Extract (owner, repo) from a GitHub URL."""
+    url = url.rstrip("/")
+    if url.endswith(".git"):
+        url = url[:-4]
+    parts = url.split("/")
+    if len(parts) < 5 or parts[2] != "github.com":
+        logger.error("仅支持 https://github.com/owner/repo 格式")
+        raise ValueError("Invalid GitHub URL")
+    return parts[3], parts[4]
+
+
+def list_tree(repo_url: str, branch: str = "HEAD") -> List[str]:
+    """Recursively list all file paths in the repository."""
+    try:
+        owner, repo = parse_repo_url(repo_url)
+        url = f"{API_BASE}/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
+        data = api_get(url)
+        return [node["path"] for node in data.get("tree", []) if node["type"] == "blob"]
+    except Exception as e:
+        logger.error(f"{e}")
+        return []
+
+
+def fetch_file(repo_url: str, path: str) -> str:
+    """Read file content (base64-decoded automatically)."""
+    try:
+        owner, repo = parse_repo_url(repo_url)
+        url = f"{API_BASE}/repos/{owner}/{repo}/contents/{path}"
+        data = api_get(url)
+        if data.get("encoding") == "base64":
+            return base64.b64decode(data["content"]).decode("utf-8")
+        # non-base64 payloads are returned as-is
+        return data.get("content", "")
+    except Exception as e:
+        logger.error(f"{e}")
+        return ""
+
+
+def main():
+    parser = argparse.ArgumentParser(description="GitHub Repo Spider")
+    parser.add_argument("url", help="仓库地址,例如 https://github.com/owner/repo")
+    parser.add_argument("--tree-only", action="store_true", help="只输出文件树")
+    parser.add_argument("--target", nargs="*",
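+    # Programmatic use of the helpers above (an illustrative sketch; it needs a
+    # valid github_access_token in config.json and network access):
+    #
+    #     paths = list_tree("https://github.com/psf/requests")
+    #     readmes = [p for p in paths if p.lower().startswith("readme")]
+    #     if readmes:
+    #         print(fetch_file("https://github.com/psf/requests", readmes[0])[:200])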
default=["README.md", "requirements.txt"], + help="需要抓取内容的文件名(支持通配符匹配)") + parser.add_argument("--branch", default="HEAD", help="分支或 commit SHA,默认 HEAD") + args = parser.parse_args() + + # 1. 文件树 + tree = list_tree(args.url, args.branch) + print(f"[info] 共发现 {len(tree)} 个文件") + if args.tree_only: + for p in tree: + print(p) + return + + # 2. 抓取目标文件 + out: Dict[str, Any] = {"tree": tree, "files": {}} + for pattern in args.target: + # 支持通配符:README* / *.py + import fnmatch + matched = fnmatch.filter(tree, pattern) + if not matched: + print(f"[warn] 未匹配到 {pattern}") + continue + for file_path in matched: + print(f"[info] 读取 {file_path}") + try: + out["files"][file_path] = fetch_file(args.url, file_path) + except Exception as e: + out["files"][file_path] = f"[error] {e}" + + # 3. 保存结果 + owner, repo = parse_repo_url(args.url) + save_to = f"{owner}_{repo}.json" + with open(save_to, "w", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=2) + print(f"[done] 结果已保存到 {save_to}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/logger/logger.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/logger/logger.py" new file mode 100644 index 0000000000000000000000000000000000000000..3074ff4ef3b2c3b8a4bc6277184d52651e3e9661 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/logger/logger.py" @@ -0,0 +1,170 @@ +import os +import sys +import json +from typing import Dict, List +import logging +from datetime import datetime +try: + import colorama # type: ignore + colorama.init() + _HAS_COLORAMA = True +except ImportError: + _HAS_COLORAMA = False + +"""终端日志工具""" +_LEVEL2COLOR = { + logging.DEBUG: "\033[36m", # cyan + logging.INFO: "\033[32m", # green + logging.WARNING: "\033[33m", # yellow + logging.ERROR: "\033[31m", # red + logging.CRITICAL: "\033[35m", # magenta +} +_RESET = "\033[0m" + +class _ColoredFormatter(logging.Formatter): + """彩色控制台格式器""" + def format(self, record): + # 仅对 levelname 上色 + if _HAS_COLORAMA or os.name != "nt": + color = _LEVEL2COLOR.get(record.levelno, "") + record.levelname = f"{color}{record.levelname}{_RESET}" + return super().format(record) + + +def _add_handlers(logger: logging.Logger, + console_level: int = logging.INFO, + max_bytes: int = 10 * 1024 * 1024, + backup_count: int = 5): + """给 logger 添加控制台 + 文件 handler""" + # 防止重复 + if logger.handlers: + return + logger.setLevel(logging.DEBUG) # 全局最低 + + # 1) 控制台 + console = logging.StreamHandler(sys.stdout) + console.setLevel(console_level) + console.setFormatter( + _ColoredFormatter( + "[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s", + datefmt="%H:%M:%S" + ) + ) + 
logger.addHandler(console)
+    logger.propagate = False  # avoid duplicate output via propagation to the root logger
+
+
+def get_logger(name: str,
+               console_level: int = logging.INFO,
+               log_dir: str = None,
+               file_level: int = logging.DEBUG):
+    """
+    Get a colored logger.
+    :param name: logger name, usually __name__
+    :param console_level: console log level
+    :param log_dir: log directory; None disables file logging (a file handler is not implemented yet, so this is currently unused)
+    :param file_level: file log level (currently unused, see above)
+    :return: logging.Logger
+    """
+    logger = logging.getLogger(name)
+    _add_handlers(logger, console_level)
+    return logger
+
+# Status values for one package verification run
+class Status:
+    CREATE_ENV_FAILED = "CREATE_ENV_FAILED"
+    INSTALL_FAILED = "INSTALL_FAILED"
+    ENV_RESOLVE_FAILED = "ENV_RESOLVE_FAILED"
+    INCOMPATIBLE = "INCOMPATIBLE"
+    COMPATIBLE = "COMPATIBLE"
+    OTHER_ERROR = "OTHER_ERROR"
+
+"""Result-file generation utilities"""
+class ResultLogger:
+    """Result recorder"""
+
+    def __init__(self, output_dir: str = "../results"):
+        self.logger = get_logger("日志记录")
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+    def save_results(self, package_name: str, results: Dict) -> str:
+        """Save test results to a JSON file."""
+        filename = f"{package_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        filepath = os.path.join(self.output_dir, filename)
+
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+
+        self.logger.info(f"结果已保存到: {filepath}")
+        return filepath
+
+    def generate_summary_report(self, all_results: List[Dict], exists_packages_num: int, total_packages_num: int) -> Dict:
+        """
+        Generate a summary report.
+
+        Arguments:
+        - all_results: list of recorded results (one package may have several records)
+        - exists_packages_num: number of packages that exist on PyPI
+        - total_packages_num: total number of packages
+        Returns: the summary report dict
+        """
+        summary = {
+            'total_packages': total_packages_num,
+            'not_found_packages': total_packages_num - exists_packages_num,
+            'total_exists_packages': exists_packages_num,
+            'successful_packages': 0,
+            'install_failed_packages': 0,
+            'create_env_failed_packages': 0,
+            'env_resolve_failed_packages': 0,
+            'verify_failed_packages': 0,
+            # (total_exists_packages - install_failed_packages) / total_exists_packages
+            'install_rate': 0.0,
+            # successful_packages / total_exists_packages
+            'compatibility_rate': 0.0,
+            # (total_packages - install_failed_packages) / total_packages
+            'install_rate_total': 0.0,
+            # successful_packages / total_packages
+            'compatibility_rate_total': 0.0,
+            'details': [],
+            'timestamp': datetime.now().isoformat()
+        }
+
+        for result in all_results:
+            if result.get('status') == Status.COMPATIBLE:
+                summary['successful_packages'] += 1
+            elif result.get('status') == Status.INSTALL_FAILED:
+                summary['install_failed_packages'] += 1
+            elif result.get('status') == Status.CREATE_ENV_FAILED:
+                summary['create_env_failed_packages'] += 1
+            elif result.get('status') == Status.ENV_RESOLVE_FAILED:
+                summary['env_resolve_failed_packages'] += 1
+            elif result.get('status') == Status.INCOMPATIBLE:
+                summary['verify_failed_packages'] += 1
+            else:
+                self.logger.error(f"未知的结果状态: {result}")
+
+            summary['details'].append({
+                'package_name': result['package_name'],
+                'status': result.get('status', 'UNKNOWN'),
+                'test_summary': result.get('summary', {})
+            })
+
+        if total_packages_num > 0:
+            summary['install_rate_total'] = (total_packages_num - summary['install_failed_packages']) / total_packages_num
+            summary['compatibility_rate_total'] = summary['successful_packages'] / total_packages_num
+        if exists_packages_num > 0:
+            summary['install_rate'] = (exists_packages_num - summary['install_failed_packages']) / exists_packages_num
+            summary['compatibility_rate'] =
summary['successful_packages'] / exists_packages_num + # 保存汇总报告 + summary_file = os.path.join(self.output_dir, f"summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json") + with open(summary_file, 'w', encoding='utf-8') as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + + self.logger.info(f"汇总报告已保存到: {summary_file}") + return summary + +if __name__ == "__main__": + result_logger = ResultLogger() + result_logger.save_results("example_package", {"status": "COMPATIBLE"}) + result_logger.generate_summary_report([ + {'package_name': 'pkg1', 'status': Status.COMPATIBLE}, + {'package_name': 'pkg2', 'status': Status.INSTALL_FAILED}, + {'package_name': 'pkg3', 'status': Status.INCOMPATIBLE} + ], exists_packages_num=3, total_packages_num=5) \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/configuration.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/configuration.py" new file mode 100644 index 0000000000000000000000000000000000000000..7ec530d7ef414a5f2458ecefdd0a0c6c857c75ff --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/configuration.py" @@ -0,0 +1,100 @@ +import json +import os +from typing import Any, Optional + +import dotenv + + +class Configuration: + """Manages configuration and environment variables for the MCP client.""" + + def __init__(self) -> None: + """Initialize configuration with environment variables.""" + self.load_env() + self._llm_api_key = os.getenv("LLM_API_KEY") + self._llm_base_url = os.getenv("LLM_BASE_URL") + self._llm_model_name = os.getenv("LLM_MODEL_NAME") + + self._ollama_model_name = os.getenv("OLLAMA_MODEL_NAME") + self._ollama_base_url = os.getenv("OLLAMA_BASE_URL") + + @staticmethod + def load_env() -> None: + """Load environment variables from .env file.""" + dotenv.load_dotenv() + + @staticmethod + def load_config(file_path: str) -> dict[str, Any]: + """Load server configuration from JSON file. + + Args: + file_path: Path to the JSON configuration file. + + Returns: + Dict containing server configuration. + + Raises: + FileNotFoundError: If configuration file doesn't exist. + JSONDecodeError: If configuration file is invalid JSON. + """ + with open(file_path, "r") as f: + return json.load(f) + + @property + def llm_api_key(self) -> str: + """Get the LLM API key. + + Returns: + The API key as a string. + + Raises: + ValueError: If the API key is not found in environment variables. + """ + if not self._llm_api_key: + raise ValueError("LLM_API_KEY not found in environment variables") + return self._llm_api_key + + @property + def llm_base_url(self) -> Optional[str]: + """Get the LLM base URL. 
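+
+        All of these values come from a .env file loaded by load_env(); an
+        illustrative .env (variable names as read in __init__, values invented):
+
+            LLM_API_KEY=sk-xxxx
+            LLM_BASE_URL=https://api.example.com/v1
+            LLM_MODEL_NAME=your-model-name
+            OLLAMA_MODEL_NAME=your-ollama-model
+            OLLAMA_BASE_URL=http://localhost:11434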
+
+        Returns:
+            The base URL as a string.
+        """
+        return self._llm_base_url
+
+    @property
+    def llm_model_name(self) -> str:
+        """Get the LLM model name.
+
+        Returns:
+            The model name as a string.
+
+        Raises:
+            ValueError: If the model name is not found in environment variables.
+        """
+        if not self._llm_model_name:
+            raise ValueError("LLM_MODEL_NAME not found in environment variables")
+        return self._llm_model_name
+
+    @property
+    def ollama_model_name(self) -> str:
+        """Get the Ollama model name.
+
+        Returns:
+            The model name as a string.
+
+        Raises:
+            ValueError: If the model name is not found in environment variables.
+        """
+        if not self._ollama_model_name:
+            raise ValueError("OLLAMA_MODEL_NAME not found in environment variables")
+        return self._ollama_model_name
+
+    @property
+    def ollama_base_url(self) -> str:
+        """Get the Ollama base URL.
+
+        Returns:
+            The base URL as a string.
+
+        Raises:
+            ValueError: If the base URL is not found in environment variables.
+        """
+        if not self._ollama_base_url:
+            raise ValueError("OLLAMA_BASE_URL not found in environment variables")
+        return self._ollama_base_url
diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_client.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_client.py"
new file mode 100644
index 0000000000000000000000000000000000000000..1dc54beb232d13a53177ed3a7e950c3c716df6aa
--- /dev/null
+++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_client.py"
@@ -0,0 +1,144 @@
+import asyncio
+import os
+import sys
+import shutil
+from pathlib import Path
+from contextlib import AsyncExitStack
+from typing import Any, List
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+
+root_dir = Path(__file__).parent.parent
+sys.path.insert(0, str(root_dir))
+
+from logger.logger import get_logger
+from mcp_chat_bot.mcp_tool import MCPTool
+
+class MCPClient:
+    """MCPClient manages the connection to one MCP server."""
+
+    def __init__(self, name: str, config: dict[str, Any]) -> None:
+        self.name: str = name
+        self.config: dict[str, Any] = config
+        self.stdio_context: Any | None = None
+        self.session: ClientSession | None = None
+        self._cleanup_lock: asyncio.Lock = asyncio.Lock()
+        self.exit_stack: AsyncExitStack = AsyncExitStack()
+        self.logger = get_logger(f"MCPClient-{self.name}")
+
+    async def initialize(self) -> None:
+        """Initialize the server connection."""
+        command = (
+            shutil.which("npx")
+            if self.config["command"] == "npx"
+            else self.config["command"]
+        )
+        server_params = StdioServerParameters(
+            command=command,
+            args=self.config["args"],
+            env={**os.environ, **self.config["env"]}
+            if self.config.get("env")
+            else None,
+        )
+        try:
+            stdio_transport = await self.exit_stack.enter_async_context(
+                stdio_client(server_params)
+            )
+            read, write
= stdio_transport + session = await self.exit_stack.enter_async_context( + ClientSession(read, write) + ) + await session.initialize() + self.session = session + except Exception as e: + self.logger.error(f"Error initializing server {self.name}: {e}") + await self.cleanup() + raise + + async def list_tools(self) -> List[MCPTool]: + """List available tools from the server. + + Returns: + A list of available tools. + + Raises: + RuntimeError: If the server is not initialized. + """ + if not self.session: + raise RuntimeError(f"Server {self.name} not initialized") + + tools_response = await self.session.list_tools() + tools = [] + + for item in tools_response: + if isinstance(item, tuple) and item[0] == "tools": + for tool in item[1]: + tools.append(MCPTool(tool.name, tool.description, tool.inputSchema)) + + return tools + + async def execute_tool( + self, + tool_name: str, + arguments: dict[str, Any], + retries: int = 2, + delay: float = 1.0, + ) -> Any: + """Execute a tool with retry mechanism. + + Args: + tool_name: Name of the tool to execute. + arguments: Tool arguments. + retries: Number of retry attempts. + delay: Delay between retries in seconds. + + Returns: + Tool execution result. + + Raises: + RuntimeError: If server is not initialized. + Exception: If tool execution fails after all retries. + """ + if not self.session: + raise RuntimeError(f"Server {self.name} not initialized") + + attempt = 0 + while attempt < retries: + try: + self.logger.info(f"Executing {tool_name}...") + result = await self.session.call_tool(tool_name, arguments) + + return result + + except Exception as e: + attempt += 1 + self.logger.warning( + f"Error executing tool: {e}. Attempt {attempt} of {retries}." + ) + if attempt < retries: + self.logger.info(f"Retrying in {delay} seconds...") + await asyncio.sleep(delay) + else: + self.logger.error("Max retries reached. 
Failing.") + raise + + async def cleanup(self) -> None: + """Clean up server resources.""" + async with self._cleanup_lock: + try: + await self.exit_stack.aclose() + self.session = None + self.stdio_context = None + except Exception as e: + self.logger.error(f"Error during cleanup of server {self.name}: {e}") + + async def __aenter__(self): + """Enter the async context manager.""" + await self.initialize() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Exit the async context manager.""" + await self.cleanup() diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/dependency_analyst.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/dependency_analyst.py" new file mode 100644 index 0000000000000000000000000000000000000000..9a583087ffe7031f3182c55d59f4cc7efc71ea65 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/dependency_analyst.py" @@ -0,0 +1,43 @@ +import sys +from pathlib import Path +from typing import List + +from mcp.server.fastmcp import FastMCP + +root_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(root_dir)) + +from info_crawler.analyse_dependency import find_dependency_for_pip_package, detect_gpu_requirement + +mcp = FastMCP("Dependency Analyse Tool") + +@mcp.tool() +def find_dependency_for_pip_package_mcp(package: str, version: str = None, include_extras: bool = False) -> List[str]: + """Analyze the dependency of a given PyPI package. + + Args: + package: The name of the PyPI package. + version: The version of the package (optional). + include_extras: Whether to include extra dependencies (default: False). + + Returns: + A list of dependencies for the specified package and version. + """ + return find_dependency_for_pip_package(package, version, include_extras) + +@mcp.tool() +def detect_gpu_requirement_for_pip_package_mcp(package: str, version: str = None) -> bool: + """Detect the GPU requirement of a given PyPI package. + + Args: + package: The name of the PyPI package. + version: The version of the package (optional). + + Returns: + A boolean indicating whether the package requires a GPU(Even the GPU is optional, the result still be true). 
+ """ + return detect_gpu_requirement(package, version)['gpu'] + +if __name__ == "__main__": + # Initialize and run the server + mcp.run() \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/github_analyst.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/github_analyst.py" new file mode 100644 index 0000000000000000000000000000000000000000..8cd15e4bf0db9e7eef76ad41407bb97314d6e32a --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/github_analyst.py" @@ -0,0 +1,49 @@ +import sys +from pathlib import Path +from typing import List + +from mcp.server.fastmcp import FastMCP + +root_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(root_dir)) + +from info_crawler.github_tools import list_tree, fetch_file + +# Create a Simple MCP Server +mcp = FastMCP("Github Analyse Tool") + +@mcp.tool() +def find_files_mcp(repo_url: str, pattern: str, branch: str = 'HEAD') -> List[str]: + """Find files matching a pattern in a GitHub repository. + + Args: + repo_url: The URL of the GitHub repository. + pattern: The pattern to match file paths against (e.g., 'README.md' for introduction files). + branch: The branch or commit SHA, default value is HEAD (Optional). + + Returns: + A list of file paths that match the given pattern. + """ + tree = list_tree(repo_url, branch) + import fnmatch + + matched_files = [f for f in tree if fnmatch.fnmatch(f, pattern)] + return matched_files + +@mcp.tool() +def fetch_file_mcp(repo_url: str, path: str) -> str: + """Fetch file content from a GitHub repository. + + Args: + repo_url: The URL of the GitHub repository. + path: The path to the file in the repository. + + Returns: + The content of the file as a string. 
+ """ + content = fetch_file(repo_url, path) + return content + +if __name__ == "__main__": + # Initialize and run the server + mcp.run() \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/mcp_servers_config.json" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/mcp_servers_config.json" new file mode 100644 index 0000000000000000000000000000000000000000..2a46857dc3085219d55048a938f5874a7c517615 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/mcp_servers_config.json" @@ -0,0 +1,32 @@ +{ + "mcpServers": { + "github_analyst": { + "command": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/.main_venv/bin/python", + "args": [ + "-u", + "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/github_analyst.py" + ] + }, + "pypi_analyst": { + "command": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/.main_venv/bin/python", + "args": [ + "-u", + "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/pypi_analyst.py" + ] + }, + "dependency_analyst": { + "command": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/.main_venv/bin/python", + "args": [ + "-u", + "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/dependency_analyst.py" + ] + }, + "test_executor": { + "command": "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/.main_venv/bin/python", + "args": [ + "-u", + "/root/contributor_rhino-bird/2025实战任务_作品文件夹/OpenCloudOS 9 AI软件自动化验证工具/黄振业_作品/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/test_executor.py" + ] + } + } +} \ No newline at end of file diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/pypi_analyst.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 
AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/pypi_analyst.py" new file mode 100644 index 0000000000000000000000000000000000000000..9dc6d4325e6c877f0d99248c1e721b4005f8fe54 --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/pypi_analyst.py" @@ -0,0 +1,79 @@ +import sys +from pathlib import Path +from typing import Any, Dict, List, Optional + +from mcp.server.fastmcp import FastMCP + +root_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(root_dir)) + + +from info_crawler.analyse_pypi import check_pypi, check_pypi_list, detect_candidate_package_names +from info_crawler.github_tools import list_tree + +mcp = FastMCP("PyPI Analysis Tool") + +@mcp.tool() +def check_pypi_mcp(candidate_package_name: str) -> Dict[str, Any]: + """Check if a candidate package name exists on PyPI. + + Args: + candidate_package_name: The candidate package name to check. + + Returns: + A dictionary containing: + - 'exists': True if the package exists on PyPI, False otherwise. + - 'info': The package information if it exists, empty otherwise. + """ + exists, info = check_pypi(candidate_package_name) + # Build the metadata summary only when PyPI returned package info; otherwise fall back to an empty dict. + if info: + analysis = { + "name": info.get("name"), + "version": info.get("version"), + "summary": info.get("summary"), + "author": info.get("author"), + "license": info.get("license"), + "home_page": info.get("home_page"), + "requires_python": info.get("requires_python"), + "dependencies": info.get("requires_dist", []), + "keywords": info.get("keywords"), + "classifiers": info.get("classifiers", []), + "project_urls": info.get("project_urls", {}), + "upload_time": info.get("upload_time"), + "yanked": info.get("yanked", False), + } + return {"exists": exists, "info": analysis if info else {}} + +@mcp.tool() +def check_pypi_list_mcp(candidate_package_names: List[str]) -> Dict[str, bool]: + """Check if candidate package names exist on PyPI. + + Args: + candidate_package_names: List of candidate package names to check. + + Returns: + A dictionary mapping each candidate package name to its existence status on PyPI (True if the package exists, False otherwise). + """ + return check_pypi_list(candidate_package_names) + + +@mcp.tool() +def try_detect_candidate_package_names_mcp(repo_url: str, file_list: Optional[List[str]] = None) -> List[str]: + """Try to detect likely PyPI package names from GitHub repository files. Only repository URLs of the form 'https://github.com/owner/repo' are supported. + + Args: + repo_url: The URL of the GitHub repository. + file_list: List of file paths in the repository (optional). + + Returns: + A list of detected candidate package names. 
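+ Note: the names are heuristic guesses derived from repository files and are not guaranteed to exist on PyPI; confirm them with check_pypi_mcp or check_pypi_list_mcp before use.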
+ """ + repo_name = repo_url.split('/')[-1] # Extract repo name from URL + if file_list is None: + file_list = list_tree(repo_url) + return detect_candidate_package_names(repo_url, repo_name, file_list) + +if __name__ == "__main__": + # Initialize and run the server + mcp.run() diff --git "a/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/test_executor.py" "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/test_executor.py" new file mode 100644 index 0000000000000000000000000000000000000000..382c9eadf04d24b28ae2fdeed1ddec799c578f5c --- /dev/null +++ "b/2025\345\256\236\346\210\230\344\273\273\345\212\241_\344\275\234\345\223\201\346\226\207\344\273\266\345\244\271/OpenCloudOS 9 AI\350\275\257\344\273\266\350\207\252\345\212\250\345\214\226\351\252\214\350\257\201\345\267\245\345\205\267/\351\273\204\346\214\257\344\270\232_\344\275\234\345\223\201/oc_contributor_huangzhenye/code/mcp_chat_bot/mcp_servers/test_executor.py" @@ -0,0 +1,173 @@ +import asyncio +import sys +import json +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +from mcp.server.fastmcp import FastMCP + +root_dir = Path(__file__).parent.parent.parent +sys.path.insert(0, str(root_dir)) + +from package_manager.package_tester import TestExecutionEngine +from package_manager.environment_resolver import EnvironmentResolver +from package_manager.package_installer import PackageInstaller +from utils.create_venv import create_venv + +CONFIG_PATH = f'{root_dir}/config.json' +with open(CONFIG_PATH, 'r') as f: + config = json.load(f) + VENVS_DIR = config['venvs_path'] + +mcp = FastMCP("Test Execution Error Resolution Tool") + +@mcp.tool() +async def execute_test_case_mcp(package_name: str, venv_name: str, test_type: str, test_case: str, expected_result: str) -> Dict: + """Execute a test case in a specified virtual environment and compare the normalized output with the normalized expected result. + + Args: + package_name: The name of the package being tested. + venv_name: The name of the virtual environment where the test case will be executed. + test_type: The type of the test including "import test", "functional test" and "gpu test". + test_case: The test case code. + expected_result: The expected test-case result, as determined by the regex. + + Returns: + A dictionary like that: + { + "test_type": "", + "test_case": "", + "status": "PASS" or "FAIL", + "actual_output": "", + "expected_output": "", + "stderr": "", + "return_code": , + "execution_time": "