summarize_fail.py

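"""Summarize recent failed CI runs in project-chip/connectedhomeip.

Pulls recent failed runs on master via the GitHub CLI (gh), prints summary
tables (recent fails, fail share by workflow, likely root causes, and
per-workflow fail rates), and writes each table to a CSV file. Raw fail logs
are cached under recent_fails_logs/.

Requires an authenticated gh CLI plus the pandas and python-slugify packages.
"""
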
import logging
import os
import subprocess

import pandas as pd
from slugify import slugify
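
# Known failure signatures, keyed first by workflow category and then by a
# substring to look for in the fail log; "short" is the label used in the
# summary table and "detail" describes the underlying error.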
error_catalog = {
    "CodeQL": {
        "No space left on device": {
            "short": "Ran out of space",
            "detail": "Exception with signature \"No space left on device\""
        },
        "Check that the disk containing the database directory has ample free space.": {
            "short": "Ran out of space",
            "detail": "Fatal internal error with message indicating that disk space most likely ran out"
        }
    },
    "Build example": {
        "Could not find a version that satisfies the requirement": {
            "short": "Requirements issue",
            "detail": "Unable to install a requirement from the Python requirements.txt"
        },
        "No module named": {
            "short": "Missing module",
            "detail": "Expected module was missing"
        }
    },
    "Full builds": {
        "No space left on device": {
            "short": "Ran out of space",
            "detail": "Exception with signature \"No space left on device\""
        }
    }
}


def process_fail(id, pr, start_time, workflow):
    logging.info(f"Processing failure in {pr}, workflow {workflow} that started at {start_time}.")

    logging.info("Building output file structure.")
    output_path = f"recent_fails_logs/{slugify(pr)}/{slugify(workflow)}/{slugify(start_time)}"
    os.makedirs(output_path, exist_ok=True)  # Tolerate re-runs over the same failure.

    logging.info("Gathering raw fail logs.")
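    # --log-failed fetches only the logs of failed steps; needs an authenticated gh CLI.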
    subprocess.run(f"gh run view -R project-chip/connectedhomeip {id} --log-failed > {output_path}/fail_log.txt", shell=True)

    # Match the fail log against the error catalog for this workflow's category.
    logging.info("Collecting info on likely cause of failure.")
    root_cause = "Unknown cause"
    with open(f"{output_path}/fail_log.txt") as fail_log_file:
        fail_log = fail_log_file.read()
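        # Workflow names are assumed to look like "<category> - <detail>";
        # the prefix before " - " selects the catalog category.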
        workflow_category = workflow.split(" - ")[0]
        if workflow_category in error_catalog:
            for error_message in error_catalog[workflow_category]:
                if error_message in fail_log:
                    root_cause = error_catalog[workflow_category][error_message]["short"]
                    break

    logging.info(f"Checking recent pass/fail rate of workflow {workflow}.")
    workflow_fail_rate_output_path = f"workflow_pass_rate/{slugify(workflow)}"
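    # The directory also marks that this workflow's run list was already fetched.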
    if not os.path.exists(workflow_fail_rate_output_path):
        os.makedirs(workflow_fail_rate_output_path)
        subprocess.run(
            f"gh run list -R project-chip/connectedhomeip -b master -w '{workflow}' --json conclusion > {workflow_fail_rate_output_path}/run_list.json", shell=True)
    else:
        logging.info("This workflow has already been processed.")

    return [pr, workflow, root_cause]


def main():
    logging.basicConfig(level=logging.INFO)  # The default WARNING level would silence the info logs below.
    logging.info("Gathering recent fails information into run_list.json.")
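    # -b master -s failure: only failed runs on the master branch; gh must be authenticated.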
    subprocess.run("gh run list -R project-chip/connectedhomeip -b master -s failure --json databaseId,displayTitle,startedAt,workflowName > run_list.json", shell=True)

    logging.info("Reading run_list.json into a DataFrame.")
    df = pd.read_json("run_list.json")

    logging.info("Listing recent fails.")
    df.columns = ["ID", "Pull Request", "Start Time", "Workflow"]
    print("Recent Fails:")
    print(df.to_string(columns=["Pull Request", "Workflow"], index=False))
    print()
    df.to_csv("recent_fails.csv", index=False)

    logging.info("Listing frequency of recent fails by workflow.")
    frequency = df["Workflow"].value_counts(normalize=True).mul(100).round().astype(
        str).reset_index(name="Percentage")  # TODO: reformat this from "50.0" to "50%"
    print("Share of Recent Fails by Workflow:")
    print(frequency.to_string(index=False))
    print()
    frequency.to_csv("recent_workflow_fails_frequency.csv", index=False)

    logging.info("Conducting fail information parsing.")
    root_causes = df.apply(lambda row: process_fail(row["ID"], row["Pull Request"],
                                                    row["Start Time"], row["Workflow"]), axis=1, result_type="expand")
    root_causes.columns = ["Pull Request", "Workflow", "Cause of Failure"]
    print("Likely Root Cause of Recent Fails:")
    print(root_causes.to_string(index=False))
    print()
    root_causes.to_csv("failure_cause_summary.csv", index=False)

    logging.info("Listing percent fail rate of recent fails by workflow.")
    fail_rate = {}
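    # Each immediate subdirectory of workflow_pass_rate/ holds one workflow's run list.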
    for workflow in next(os.walk("workflow_pass_rate"))[1]:
        try:
            info = pd.read_json(f"workflow_pass_rate/{workflow}/run_list.json")
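            # Runs without a conclusion (e.g. still in progress) are dropped.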
            info = info[info["conclusion"].str.len() > 0]
            fail_rate[workflow] = [info["conclusion"].value_counts(normalize=True).mul(100).round()["failure"]]
        except Exception:
            logging.exception(f"Recent runs info for {workflow} was not collected.")
    fail_rate = pd.DataFrame.from_dict(fail_rate, orient="index", columns=["Fail Rate"])
    print("Recent Fail Rate of Each Workflow:")
    print(fail_rate.to_string())
    fail_rate.to_csv("workflow_fail_rate.csv")


if __name__ == "__main__":
    main()