# collect.py
  1. #
  2. # Copyright (c) 2021 Project CHIP Authors
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. """Collect information from various sources into Memory Map DataFrames."""
  17. import bisect
  18. from typing import Callable, Dict, List, Mapping, Optional, Sequence, Tuple
  19. import memdf.collector.bloaty
  20. import memdf.collector.csv
  21. import memdf.collector.elftools
  22. import memdf.collector.readelf
  23. import memdf.collector.su
  24. import memdf.name
  25. import memdf.select
  26. import memdf.util.config
  27. import pandas as pd # type: ignore
  28. from elftools.elf.constants import SH_FLAGS # type: ignore
  29. from memdf import DF, Config, ConfigDescription, DFs, ExtentDF, SectionDF, SymbolDF
  30. from memdf.collector.util import simplify_source
# Option(s) for stripping path prefixes from source file names, split out
# from CONFIG so other tools can reuse just this piece.
PREFIX_CONFIG: ConfigDescription = {
    'collect.prefix': {
        'help': 'Strip PATH from the beginning of source file names',
        'metavar': 'PATH',
        'default': [],
        'argparse': {
            'alias': ['--prefix', '--strip-prefix'],
            # Repeatable: each occurrence appends another prefix to strip.
            'action': 'append',
        }
    },
}
# Combined configuration description for all collection front ends:
# option groups, the per-method collector options, and the method selector.
CONFIG: ConfigDescription = {
    Config.group_def('input'): {
        'title': 'input options',
    },
    Config.group_def('tool'): {
        'title': 'external tool options',
    },
    Config.group_map('collect'): {
        'group': 'input'
    },
    # Merge in the options declared by each collector backend.
    **memdf.collector.bloaty.CONFIG,
    **memdf.collector.csv.CONFIG,
    **memdf.collector.elftools.CONFIG,
    **memdf.collector.readelf.CONFIG,
    'collect.method': {
        'help':
        'Method of input processing: one of'
        ' elftools, readelf, bloaty, csv, tsv, su.',
        'metavar': 'METHOD',
        # Must stay in sync with the FILE_READERS dispatch table below.
        'choices': ['elftools', 'readelf', 'bloaty', 'csv', 'tsv', 'su'],
        'default': 'elftools',
        'argparse': {
            'alias': ['-f'],
        },
    },
    **PREFIX_CONFIG,
}
# ARM ELF mapping symbols that mark instruction-set/data regions
# ($a = ARM code, $t/$t.x = Thumb code, $d/$d.realdata = data).
ARM_SPECIAL_SYMBOLS = frozenset(["$a", "$t", "$t.x", "$d", "$d.realdata"])
  70. def postprocess_symbols(config: Config, symbols: SymbolDF) -> SymbolDF:
  71. """Postprocess a symbol table after collecting from one source.
  72. If the symbol table contains FILE symbols, they will be removed and
  73. replaced by a 'file' column on other symbols.
  74. If the symbol table contains ARM mode symbols, they will be removed
  75. and replaced by an 'arm' column on other symbols.
  76. """
  77. files = []
  78. arms = []
  79. arm_symbols = {}
  80. current_file = ''
  81. current_arm = ''
  82. has_file = False
  83. if config['collect.prefix-file']:
  84. prefixes = config.get_re('collect.prefix')
  85. else:
  86. prefixes = None
  87. if 'type' in symbols.columns:
  88. for symbol in symbols.itertuples():
  89. if symbol.type == 'FILE':
  90. has_file = True
  91. current_file = symbol.symbol
  92. if prefixes:
  93. current_file = simplify_source(current_file, prefixes)
  94. elif symbol.type == 'NOTYPE':
  95. if symbol.symbol.startswith('$'):
  96. if current_arm or symbol.symbol in ARM_SPECIAL_SYMBOLS:
  97. current_arm = symbol.symbol
  98. arm_symbols[current_arm] = True
  99. files.append(current_file)
  100. arms.append(current_arm)
  101. if has_file:
  102. symbols['file'] = files
  103. if current_arm:
  104. symbols['arm'] = arms
  105. if has_file:
  106. symbols = symbols[symbols['type'] != 'FILE']
  107. if current_arm:
  108. syms = arm_symbols.keys()
  109. symbols = symbols[~symbols.symbol.isin(syms)]
  110. return symbols
  111. def postprocess_file(config: Config, dfs: DFs) -> None:
  112. """Postprocess tables after collecting from one source."""
  113. if SymbolDF.name in dfs:
  114. dfs[SymbolDF.name] = postprocess_symbols(config, dfs[SymbolDF.name])
def fill_holes(config: Config, symbols: SymbolDF, sections: SectionDF) -> DFs:
    """Account for space not used by any symbol, or by multiple symbols.

    Walks the symbols of each configured or allocated section in address
    order and synthesizes 'gap', 'unused', and 'overlap' rows for address
    ranges not covered by exactly one symbol. Returns a dict containing
    one ExtentDF per synthetic kind plus the augmented symbol table.
    """
    # These symbols mark the start or end of unused space.
    start_unused = frozenset(config.get('symbol.free.start', []))
    end_unused = frozenset(config.get('symbol.free.end', []))
    # Column layout for synthetic rows; optional columns are included only
    # when the incoming symbol table has them.
    extent_columns = ['address', 'size', 'section', 'file']
    need_cu = 'cu' in symbols.columns
    if need_cu:
        extent_columns.append('cu')
    need_input = 'input' in symbols.columns
    if need_input:
        extent_columns.append('input')
    columns = ['symbol', *extent_columns, 'type', 'bind']

    def filler(name, address, size, previous, current) -> List:
        """Build one synthetic symbol row, inheriting section/file/etc.
        from the surrounding (previous, else current) symbol."""
        row = [
            name,  # symbol
            address,  # address
            size,  # size
            (previous.section if previous else
             current.section if current else memdf.name.UNDEF),  # section
            (previous.file
             if previous else current.file if current else ''),  # file
        ]
        if need_cu:
            row.append(
                previous.cu if previous else current.cu if current else '')
        if need_input:
            row.append(previous.input if previous else current.
                       input if current else '')
        row.append('NOTYPE')  # type
        row.append('LOCAL')  # bind
        return row

    def fill_gap(previous, current, from_address,
                 to_address) -> Tuple[str, List]:
        """Add a row for an unaccounted gap or unused space."""
        size = to_address - from_address
        # A gap adjacent to a configured free-space marker counts as
        # deliberately unused rather than unaccounted.
        if (previous is None or previous.symbol in start_unused
                or current.symbol in end_unused):
            use = 'unused'
            name = memdf.name.unused(from_address, size)
        else:
            use = 'gap'
            name = memdf.name.gap(from_address, size)
        return (use, filler(name, from_address, size, previous, current))

    def fill_overlap(previous, current, from_address,
                     to_address) -> Tuple[str, List]:
        """Add a row for overlap (negative-size span between symbols)."""
        size = to_address - from_address
        return ('overlap',
                filler(memdf.name.overlap(from_address, -size), from_address,
                       size, previous, current))

    # Find the address range for sections that are configured or allocated.
    config_sections = set()
    for _, s in config.get('region.sections', {}).items():
        config_sections |= set(s)
    section_to_range = {}
    start_to_section = {}
    section_starts = [0]
    for s in sections.itertuples():
        if ((s.section in config_sections) or (s.flags & SH_FLAGS.SHF_ALLOC)):
            section_to_range[s.section] = range(s.address, s.address + s.size)
            start_to_section[s.address] = s.section
            section_starts.append(s.address)
    # Sorted start addresses enable the bisect lookup below.
    section_starts.sort()
    new_symbols: Dict[str, List[list]] = {
        'gap': [],
        'unused': [],
        'overlap': []
    }
    section_range = None
    previous_symbol = None
    current_address = 0
    # Only real symbols in sections of interest participate.
    iterable_symbols = symbols.loc[(symbols.type != 'SECTION')
                                   & (symbols.type != 'FILE')
                                   & symbols.section.isin(section_to_range)]
    iterable_symbols = iterable_symbols.sort_values(by='address')
    for symbol in iterable_symbols.itertuples():
        if not previous_symbol or symbol.section != previous_symbol.section:
            # We sometimes see symbols that have the value of their section end
            # address (so they are not actually within the section) and have
            # the same address as a symbol in the next section.
            symbol_address_section = start_to_section.get(section_starts[
                bisect.bisect_right(section_starts, symbol.address) - 1])
            if symbol_address_section != symbol.section:
                continue
            # Starting or switching sections.
            if previous_symbol and section_range:
                # previous_symbol is the last in its section; close out any
                # tail space up to the section end.
                if current_address < section_range[-1] + 1:
                    use, row = fill_gap(previous_symbol, previous_symbol,
                                        current_address, section_range[-1] + 1)
                    new_symbols[use].append(row)
            # Start of section.
            previous_symbol = None
            section_range = section_to_range.get(symbol.section)
            if section_range:
                current_address = section_range[0]
        if section_range:
            # Compare the running cursor against this symbol's start.
            if current_address < symbol.address:
                use, row = fill_gap(previous_symbol, symbol, current_address,
                                    symbol.address)
                new_symbols[use].append(row)
            elif current_address > symbol.address:
                use, row = fill_overlap(previous_symbol, symbol,
                                        current_address, symbol.address)
                new_symbols[use].append(row)
            current_address = symbol.address + symbol.size
        previous_symbol = symbol
    # Package the synthetic rows: per-kind extent tables plus the merged,
    # re-sorted symbol table.
    dfs = {k: SymbolDF(new_symbols[k], columns=columns) for k in new_symbols}
    symbols = pd.concat([symbols, *dfs.values()]).fillna('')
    symbols.sort_values(by='address', inplace=True)
    for k in dfs:
        dfs[k] = ExtentDF(dfs[k][extent_columns])
        dfs[k].attrs['name'] = k
    dfs[SymbolDF.name] = SymbolDF(symbols)
    return dfs
  231. def postprocess_collected(config: Config, dfs: DFs) -> None:
  232. """Postprocess tables after reading all sources."""
  233. # Prune tables according to configuration options. This happens before
  234. # fill_holes() so that space of any pruned symbols will be accounted for,
  235. # and to avoid unnecessary work for pruned sections.
  236. for c in [SymbolDF, SectionDF]:
  237. if c.name in dfs:
  238. dfs[c.name] = memdf.select.select_configured(
  239. config, dfs[c.name], memdf.select.COLLECTED_CHOICES)
  240. # Account for space not used by any symbol, or by multiple symbols.
  241. if (SymbolDF.name in dfs and SectionDF.name in dfs
  242. and config.get('args.fill_holes', True)):
  243. dfs.update(fill_holes(config, dfs[SymbolDF.name], dfs[SectionDF.name]))
  244. # Create synthetic columns (e.g. 'region') and prune tables
  245. # according to their configuration. This happens after fill_holes()
  246. # so that synthetic column values will be created for the gap symbols.
  247. for c in [SymbolDF, SectionDF]:
  248. if c.name in dfs:
  249. for column in memdf.select.SYNTHETIC_CHOICES:
  250. dfs[c.name] = memdf.select.synthesize_column(
  251. config, dfs[c.name], column)
  252. dfs[c.name] = memdf.select.select_configured_column(
  253. config, dfs[c.name], column)
  254. for df in dfs.values():
  255. if demangle := set((c for c in df.columns if c.endswith('symbol'))):
  256. df.attrs['demangle'] = demangle
  257. if hexify := set((c for c in df.columns if c.endswith('address'))):
  258. df.attrs['hexify'] = hexify
# Common signature of every reader: (config, filename, method) -> DFs.
FileReader = Callable[[Config, str, str], DFs]

# Dispatch table from the 'collect.method' option value to its reader.
# Note 'csv' and 'tsv' share one reader; 'su' uses read_dir (presumably
# it scans a directory of .su files — verify against memdf.collector.su).
FILE_READERS: Dict[str, FileReader] = {
    'bloaty': memdf.collector.bloaty.read_file,
    'elftools': memdf.collector.elftools.read_file,
    'readelf': memdf.collector.readelf.read_file,
    'csv': memdf.collector.csv.read_file,
    'tsv': memdf.collector.csv.read_file,
    'su': memdf.collector.su.read_dir,
}
  268. def collect_files(config: Config,
  269. files: Optional[List[str]] = None,
  270. method: Optional[str] = None) -> DFs:
  271. """Read a filtered memory map from a set of files."""
  272. filenames = files if files else config.get('args.inputs', [])
  273. if method is None:
  274. method = config.get('collect.method', 'csv')
  275. frames: Dict[str, List[DF]] = {}
  276. for filename in filenames:
  277. dfs: DFs = FILE_READERS[method](config, filename, method)
  278. postprocess_file(config, dfs)
  279. for k, frame in dfs.items():
  280. if k not in frames:
  281. frames[k] = []
  282. frames[k].append(frame)
  283. dfs = {}
  284. for k, v in frames.items():
  285. dfs[k] = pd.concat(v, ignore_index=True)
  286. postprocess_collected(config, dfs)
  287. return dfs
  288. def parse_args(config_desc: Mapping, argv: Sequence[str]) -> Config:
  289. """Common argument parsing for collection tools."""
  290. config = Config().init({
  291. **memdf.util.config.CONFIG,
  292. **CONFIG,
  293. **config_desc
  294. })
  295. config.argparse.add_argument('inputs', metavar='FILE', nargs='+')
  296. return config.parse(argv)