关于什么ansible,我这里就不做科普了,总之一句话,要做分布式系统的运维,实现批量系统配置、批量程序部署、批量运行命令等功能,ansible就是一大杀器,能令你事半功倍。
但作为一个cli工具,其使用场景还是受cli的限制,无法实现运行过程中更深入的交互和逻辑控制。ansible本身是用python做的,所以实际上是和python的脚本控制是无缝链接的,可以在python中直接使用。但怎奈关于这部分的内容,官网文档中本身就少得可怜,互联网上也少有关于这一块的介绍,所以这个博文试着抛砖引玉,为大家做个指引。
首先,我不是python大牛,没法直接从代码入手一行一行的去看整个ansible的架构和代码,并且官网上也说了,ansible的代码一直在重构(从网上找到的屈指可数的例子也证明了这一点,那些python里面调用ansible的例子,在新版本的ansbile中一个都跑不了),因此,我们需要的是一个思路: 从CLI开始入手
ansible playbook 的调用
我们以ansible-playbook为例子,说明一下怎么从CLI开始一步一步的找到方法在python中调用ansible
找到命令路径
$ which ansible-playbook
/Library/Frameworks/Python.framework/Versions/3.6/bin/ansible-playbook
找到源码并分析
通过以上命令找到ansible-playbook。打开文件(文件内容不用细看):
#!/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6
# (c) 2012, Michael DeHaan <michael.dehaan@gmail.com>
#
# This file is part of Ansible
#
# Ansible is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ansible is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
########################################################
from __future__ import (absolute_import, division, print_function)
__metaclass__ = type
__requires__ = ['ansible']
import os
import shutil
import sys
import traceback
from ansible import context
from ansible.errors import AnsibleError, AnsibleOptionsError, AnsibleParserError
from ansible.module_utils._text import to_text
# Used for determining if the system is running a new enough python version
# and should only restrict on our documented minimum versions
_PY3_MIN = sys.version_info[:2] >= (3, 5)
_PY2_MIN = (2, 6) <= sys.version_info[:2] < (3,)
_PY_MIN = _PY3_MIN or _PY2_MIN
if not _PY_MIN:
raise SystemExit('ERROR: Ansible requires a minimum of Python2 version 2.6 or Python3 version 3.5. Current version: %s' % ''.join(sys.version.splitlines()))
class LastResort(object):
# OUTPUT OF LAST RESORT
def display(self, msg, log_only=None):
print(msg, file=sys.stderr)
def error(self, msg, wrap_text=None):
print(msg, file=sys.stderr)
if __name__ == '__main__':
display = LastResort()
try: # bad ANSIBLE_CONFIG or config options can force ugly stacktrace
import ansible.constants as C
from ansible.utils.display import Display
except AnsibleOptionsError as e:
display.error(to_text(e), wrap_text=False)
sys.exit(5)
cli = None
me = os.path.basename(sys.argv[0])
try:
display = Display()
display.debug("starting run")
sub = None
target = me.split('-')
if target[-1][0].isdigit():
# Remove any version or python version info as downstreams
# sometimes add that
target = target[:-1]
if len(target) > 1:
sub = target[1]
myclass = "%sCLI" % sub.capitalize()
elif target[0] == 'ansible':
sub = 'adhoc'
myclass = 'AdHocCLI'
else:
raise AnsibleError("Unknown Ansible alias: %s" % me)
try:
mycli = getattr(__import__("ansible.cli.%s" % sub, fromlist=[myclass]), myclass)
except ImportError as e:
# ImportError members have changed in py3
if 'msg' in dir(e):
msg = e.msg
else:
msg = e.message
if msg.endswith(' %s' % sub):
raise AnsibleError("Ansible sub-program not implemented: %s" % me)
else:
raise
try:
args = [to_text(a, errors='surrogate_or_strict') for a in sys.argv]
except UnicodeError:
display.error('Command line args are not in utf-8, unable to continue. Ansible currently only understands utf-8')
display.display(u"The full traceback was:\n\n%s" % to_text(traceback.format_exc()))
exit_code = 6
else:
cli = mycli(args)
exit_code = cli.run()
except AnsibleOptionsError as e:
cli.parser.print_help()
display.error(to_text(e), wrap_text=False)
exit_code = 5
except AnsibleParserError as e:
display.error(to_text(e), wrap_text=False)
exit_code = 4
# TQM takes care of these, but leaving comment to reserve the exit codes
# except AnsibleHostUnreachable as e:
# display.error(str(e))
# exit_code = 3
# except AnsibleHostFailed as e:
# display.error(str(e))
# exit_code = 2
except AnsibleError as e:
display.error(to_text(e), wrap_text=False)
exit_code = 1
except KeyboardInterrupt:
display.error("User interrupted execution")
exit_code = 99
except Exception as e:
if C.DEFAULT_DEBUG:
# Show raw stacktraces in debug mode, It also allow pdb to
# enter post mortem mode.
raise
have_cli_options = bool(context.CLIARGS)
display.error("Unexpected Exception, this is probably a bug: %s" % to_text(e), wrap_text=False)
if not have_cli_options or have_cli_options and context.CLIARGS['verbosity'] > 2:
log_only = False
if hasattr(e, 'orig_exc'):
display.vvv('\nexception type: %s' % to_text(type(e.orig_exc)))
why = to_text(e.orig_exc)
if to_text(e) != why:
display.vvv('\noriginal msg: %s' % why)
else:
display.display("to see the full traceback, use -vvv")
log_only = True
display.display(u"the full traceback was:\n\n%s" % to_text(traceback.format_exc()), log_only=log_only)
exit_code = 250
finally:
# Remove ansible tmpdir
shutil.rmtree(C.DEFAULT_LOCAL_TMP, True)
sys.exit(exit_code)
可以看到,大部分的内容都是异常的处理,跳过不看,找出最重要的几句:
...
if len(target) > 1:
sub = target[1]
myclass = "%sCLI" % sub.capitalize()
elif target[0] == 'ansible':
sub = 'adhoc'
myclass = 'AdHocCLI'
else:
raise AnsibleError("Unknown Ansible alias: %s" % me)
try:
mycli = getattr(__import__("ansible.cli.%s" % sub, fromlist=[myclass]), myclass)
...
cli = mycli(args)
exit_code = cli.run()
通过这几行代码,我们可以定位到ansible.cli
这个package的目录。下面支持多种CLI:
├── adhoc.py
├── arguments
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── optparse_helpers.cpython-36.pyc
│ └── optparse_helpers.py
├── config.py
├── console.py
├── doc.py
├── galaxy.py
├── inventory.py
├── playbook.py
├── pull.py
└── vault.py
并且调用的方式也很简单,只需要搞清楚args是什么就可以了,你可以简单的加一句print(args)
。
简化调用
最后以上代码可以简化为如下示例:
from ansible.cli.playbook import PlaybookCLI
mycli = PlaybookCLI
cli = mycli([" ",'-i', 'hosts.uat', 'kibana_deploy_plugin.yml'])
exit_code = cli.run()
注意,这里参数的方式['-i', 'hosts.uat', 'kibana_deploy_plugin.yml']
,格式和我们平时运行ansible-playbook
一样,只是需要以数组的方式提供。
运行一下,结果如下:
PLAY [Deploy kibana optimize] **************************************************
TASK [Gathering Facts] *********************************************************
fatal: [BI-LASS-Kibana_10.60.x.x]: UNREACHABLE! => {"changed": false, "msg": "Failed to connect to the host via ssh: ssh: connect to host 10.60.x.x port 22: Operation timed out", "unreachable": true}
fatal: [BI-LASS-Kibana_10.50.x.x]: UNREACHABLE! => {"changed": false, "msg": "Failed to connect to the host via ssh: ssh: connect to host 10.50.x.x port 22: Operation timed out", "unreachable": true}
PLAY RECAP *********************************************************************
BI-LASS-Kibana_10.60.x.x : ok=0 changed=0 unreachable=1 failed=0 skipped=0 rescued=0 ignored=0
BI-LASS-Kibana_10.50.x.x : ok=0 changed=0 unreachable=1 failed=0 skipped=0 rescued=0 ignored=0
Process finished with exit code 0
可见,脚本能正常运行,到这里,我们已经可以在python中调用ansible-playbook了。
调用后的交互
这还不够,我们需要交互,需要得到task运行的结果,并根据结果做额外的分析和逻辑处理,因此需要更深入的研究代码。
ansible的运行分析
cli.run()
先看看的cli.run()
函数:
def run(self):
super(PlaybookCLI, self).run()
# Note: slightly wrong, this is written so that implicit localhost
# manages passwords
sshpass = None
becomepass = None
passwords = {}
# initial error check, to make sure all specified playbooks are accessible
# before we start running anything through the playbook executor
b_playbook_dirs = []
for playbook in context.CLIARGS['args']:
if not os.path.exists(playbook):
raise AnsibleError("the playbook: %s could not be found" % playbook)
if not (os.path.isfile(playbook) or stat.S_ISFIFO(os.stat(playbook).st_mode)):
raise AnsibleError("the playbook: %s does not appear to be a file" % playbook)
b_playbook_dir = os.path.dirname(os.path.abspath(to_bytes(playbook, errors='surrogate_or_strict')))
# load plugins from all playbooks in case they add callbacks/inventory/etc
add_all_plugin_dirs(b_playbook_dir)
b_playbook_dirs.append(b_playbook_dir)
set_collection_playbook_paths(b_playbook_dirs)
# don't deal with privilege escalation or passwords when we don't need to
if not (context.CLIARGS['listhosts'] or context.CLIARGS['listtasks'] or
context.CLIARGS['listtags'] or context.CLIARGS['syntax']):
(sshpass, becomepass) = self.ask_passwords()
passwords = {'conn_pass': sshpass, 'become_pass': becomepass}
# create base objects
loader, inventory, variable_manager = self._play_prereqs()
# (which is not returned in list_hosts()) is taken into account for
# warning if inventory is empty. But it can't be taken into account for
# checking if limit doesn't match any hosts. Instead we don't worry about
# limit if only implicit localhost was in inventory to start with.
#
# Fix this when we rewrite inventory by making localhost a real host (and thus show up in list_hosts())
CLI.get_host_list(inventory, context.CLIARGS['subset'])
# flush fact cache if requested
if context.CLIARGS['flush_cache']:
self._flush_cache(inventory, variable_manager)
# create the playbook executor, which manages running the plays via a task queue manager
pbex = PlaybookExecutor(playbooks=context.CLIARGS['args'], inventory=inventory,
variable_manager=variable_manager, loader=loader,
passwords=passwords)
results = pbex.run()
if isinstance(results, list):
for p in results:
display.display('\nplaybook: %s' % p['playbook'])
for idx, play in enumerate(p['plays']):
if play._included_path is not None:
loader.set_basedir(play._included_path)
else:
pb_dir = os.path.realpath(os.path.dirname(p['playbook']))
loader.set_basedir(pb_dir)
msg = "\n play #%d (%s): %s" % (idx + 1, ','.join(play.hosts), play.name)
mytags = set(play.tags)
msg += '\tTAGS: [%s]' % (','.join(mytags))
if context.CLIARGS['listhosts']:
playhosts = set(inventory.get_hosts(play.hosts))
msg += "\n pattern: %s\n hosts (%d):" % (play.hosts, len(playhosts))
for host in playhosts:
msg += "\n %s" % host
display.display(msg)
all_tags = set()
if context.CLIARGS['listtags'] or context.CLIARGS['listtasks']:
taskmsg = ''
if context.CLIARGS['listtasks']:
taskmsg = ' tasks:\n'
def _process_block(b):
taskmsg = ''
for task in b.block:
if isinstance(task, Block):
taskmsg += _process_block(task)
else:
if task.action == 'meta':
continue
all_tags.update(task.tags)
if context.CLIARGS['listtasks']:
cur_tags = list(mytags.union(set(task.tags)))
cur_tags.sort()
if task.name:
taskmsg += " %s" % task.get_name()
else:
taskmsg += " %s" % task.action
taskmsg += "\tTAGS: [%s]\n" % ', '.join(cur_tags)
return taskmsg
all_vars = variable_manager.get_vars(play=play)
for block in play.compile():
block = block.filter_tagged_tasks(all_vars)
if not block.has_tasks():
continue
taskmsg += _process_block(block)
if context.CLIARGS['listtags']:
cur_tags = list(mytags.union(all_tags))
cur_tags.sort()
taskmsg += " TASK TAGS: [%s]\n" % ', '.join(cur_tags)
display.display(taskmsg)
return 0
else:
return results
这个函数仍然很长,关键是:
# create the playbook executor, which manages running the plays via a task queue manager
pbex = PlaybookExecutor(playbooks=context.CLIARGS['args'], inventory=inventory,
variable_manager=variable_manager, loader=loader,
passwords=passwords)
results = pbex.run()
这里用PlaybookExecutor
对执行过程进行了封装,并且使用task queue manager
。
PlaybookExecutor.run()
继续跟进PlaybookExecutor.run()
,我们可以看到关键代码是:
class PlaybookExecutor:
'''
This is the primary class for executing playbooks, and thus the
basis for bin/ansible-playbook operation.
'''
def __init__(self, playbooks, inventory, variable_manager, loader, passwords):
self._playbooks = playbooks
self._inventory = inventory
self._variable_manager = variable_manager
self._loader = loader
self.passwords = passwords
self._unreachable_hosts = dict()
if context.CLIARGS.get('listhosts') or context.CLIARGS.get('listtasks') or \
context.CLIARGS.get('listtags') or context.CLIARGS.get('syntax'):
self._tqm = None
else:
self._tqm = TaskQueueManager(
inventory=inventory,
variable_manager=variable_manager,
loader=loader,
passwords=self.passwords,
forks=context.CLIARGS.get('forks'),
)
...
def run(self):
'''
Run the given playbook, based on the settings in the play which
may limit the runs to serialized groups, etc.
'''
...
if self._tqm is None:
# we are just doing a listing
entry['plays'].append(play)
else:
self._tqm._unreachable_hosts.update(self._unreachable_hosts)
previously_failed = len(self._tqm._failed_hosts)
previously_unreachable = len(self._tqm._unreachable_hosts)
break_play = False
# we are actually running plays
batches = self._get_serialized_batches(play)
if len(batches) == 0:
self._tqm.send_callback('v2_playbook_on_play_start', play)
self._tqm.send_callback('v2_playbook_on_no_hosts_matched')
for batch in batches:
# restrict the inventory to the hosts in the serialized batch
self._inventory.restrict_to_hosts(batch)
# and run it...
result = self._tqm.run(play=play)
...
可以看到有几点:
-
如果参数包含
listhosts
、listtasks
、listtags
、syntax
则不会真正运行,而是返回playbook的信息 -
如果需要运行,则通过
TaskQueueManager
,以forks
的方式进行批量操作(这样的并发并非线程安全的) -
我们需要进一步到
_tqm.run(play=play)
函数中进行观察
TaskQueueManager.run()
从以下代码可以看到几点:
-
ansible在TaskQueueManager才去遍历一个playbook下的roles和tasks
-
默认的strategy是linear strategy
-
当task在所有host上都执行完毕之后才会进入到下一个task
def run(self, play):
'''
Iterates over the roles/tasks in a play, using the given (or default)
strategy for queueing tasks. The default is the linear strategy, which
operates like classic Ansible by keeping all hosts in lock-step with
a given task (meaning no hosts move on to the next task until all hosts
are done with the current task).
'''
if not self._callbacks_loaded:
self.load_callbacks()
all_vars = self._variable_manager.get_vars(play=play)
warn_if_reserved(all_vars)
templar = Templar(loader=self._loader, variables=all_vars)
new_play = play.copy()
new_play.post_validate(templar)
new_play.handlers = new_play.compile_roles_handlers() + new_play.handlers
self.hostvars = HostVars(
inventory=self._inventory,
variable_manager=self._variable_manager,
loader=self._loader,
)
play_context = PlayContext(new_play, self.passwords, self._connection_lockfile.fileno())
if (self._stdout_callback and
hasattr(self._stdout_callback, 'set_play_context')):
self._stdout_callback.set_play_context(play_context)
for callback_plugin in self._callback_plugins:
if hasattr(callback_plugin, 'set_play_context'):
callback_plugin.set_play_context(play_context)
self.send_callback('v2_playbook_on_play_start', new_play)
# build the iterator
iterator = PlayIterator(
inventory=self._inventory,
play=new_play,
play_context=play_context,
variable_manager=self._variable_manager,
all_vars=all_vars,
start_at_done=self._start_at_done,
)
# adjust to # of workers to configured forks or size of batch, whatever is lower
self._initialize_processes(min(self._forks, iterator.batch_size))
# load the specified strategy (or the default linear one)
strategy = strategy_loader.get(new_play.strategy, self)
if strategy is None:
raise AnsibleError("Invalid play strategy specified: %s" % new_play.strategy, obj=play._ds)
# Because the TQM may survive multiple play runs, we start by marking
# any hosts as failed in the iterator here which may have been marked
# as failed in previous runs. Then we clear the internal list of failed
# hosts so we know what failed this round.
for host_name in self._failed_hosts.keys():
host = self._inventory.get_host(host_name)
iterator.mark_host_failed(host)
self.clear_failed_hosts()
# during initialization, the PlayContext will clear the start_at_task
# field to signal that a matching task was found, so check that here
# and remember it so we don't try to skip tasks on future plays
if context.CLIARGS.get('start_at_task') is not None and play_context.start_at_task is None:
self._start_at_done = True
# and run the play using the strategy and cleanup on way out
play_return = strategy.run(iterator, play_context)
# now re-save the hosts that failed from the iterator to our internal list
for host_name in iterator.get_failed_hosts():
self._failed_hosts[host_name] = True
strategy.cleanup()
self._cleanup_processes()
return play_return
因此,我们仍然需要到strategy.run()
一探究竟
strategy.run()
通过这部分代码
def run(self, iterator, play_context):
'''
The linear strategy is simple - get the next task and queue
it for all hosts, then wait for the queue to drain before
moving on to the next task
'''
# iterate over each task, while there is one left to run
result = self._tqm.RUN_OK
work_to_do = True
while work_to_do and not self._tqm._terminated:
try:
display.debug("getting the remaining hosts for this loop")
hosts_left = self.get_hosts_left(iterator)
display.debug("done getting the remaining hosts for this loop")
# queue up this task for each host in the inventory
callback_sent = False
work_to_do = False
host_results = []
host_tasks = self._get_next_task_lockstep(hosts_left, iterator)
...
results += self._process_pending_results(iterator, max_passes=max(1, int(len(self._tqm._workers) * 0.1)))
...
return super(StrategyModule, self).run(iterator, play_context, result)
(这个函数太长,我省略了大部分)
这里最终的一句就是results += self._process_pending_results(iterator, max_passes=max(1, int(len(self._tqm._workers) * 0.1)))
,即我们是通过_process_pending_results
函数进行处理的,并且在函数里面使用大量的回调:
if task_result.is_failed() or task_result.is_unreachable():
self._tqm.send_callback('v2_runner_item_on_failed', task_result)
elif task_result.is_skipped():
self._tqm.send_callback('v2_runner_item_on_skipped', task_result)
else:
if 'diff' in task_result._result:
if self._diff or getattr(original_task, 'diff', False):
self._tqm.send_callback('v2_on_file_diff', task_result)
self._tqm.send_callback('v2_runner_item_on_ok', task_result)
这些回调来自于taskQueueManager
TaskQueueManager.send_callback()
从以下代码,我们可以看到这个回调可以来自于_stdout_callback
def send_callback(self, method_name, *args, **kwargs):
for callback_plugin in [self._stdout_callback] + self._callback_plugins:
# a plugin that set self.disabled to True will not be called
# see osx_say.py example for such a plugin
if getattr(callback_plugin, 'disabled', False):
continue
# try to find v2 method, fallback to v1 method, ignore callback if no method found
methods = []
for possible in [method_name, 'v2_on_any']:
gotit = getattr(callback_plugin, possible, None)
if gotit is None:
gotit = getattr(callback_plugin, possible.replace('v2_', ''), None)
if gotit is not None:
methods.append(gotit)
因此,只要我们能重写TaskQueueManager的stdout_callback,就可以获取中间结果
最终的代码
省略中间分析步骤,直接上代码
from ansible.cli.playbook import PlaybookCLI
from ansible.plugins.callback import CallbackBase
import json
from ansible.cli import CLI
from ansible.executor.playbook_executor import PlaybookExecutor
from ansible import context
from ansible import constants as C
class ResultCallback(CallbackBase):
"""A sample callback plugin used for performing an action as results come in
If you want to collect all results into a single object for processing at
the end of the execution, look into utilizing the ``json`` callback plugin
or writing your own custom callback plugin
"""
def v2_runner_on_ok(self, result, **kwargs):
"""Print a json representation of the result
This method could store the result in an instance attribute for retrieval later
"""
host = result._host
print(json.dumps({host.name: result._result}, indent=4))
def v2_runner_on_failed(self, result, **kwargs):
host = result._host.get_name()
self.runner_on_failed(host, result._result, False)
print('===v2_runner_on_failed====host=%s===result=%s'%(host,result._result))
def v2_runner_on_unreachable(self, result):
host = result._host.get_name()
self.runner_on_unreachable(host, result._result)
print('===v2_runner_on_unreachable====host=%s===result=%s'%(host,result._result))
def v2_runner_on_skipped(self, result):
if C.DISPLAY_SKIPPED_HOSTS:
host = result._host.get_name()
self.runner_on_skipped(host, self._get_item(getattr(result._result,'results',{})))
print("this task does not execute,please check parameter or condition.")
def v2_playbook_on_stats(self, stats):
print('===========play executes completed========')
cli = PlaybookCLI([" ",'-i', 'hosts.uat', 'kibana_deploy_plugin.yml'])
super(PlaybookCLI,cli).run()
loader, inventory, variable_manager = cli._play_prereqs()
CLI.get_host_list(inventory, context.CLIARGS['subset'])
pbex = PlaybookExecutor(playbooks=context.CLIARGS['args'], inventory=inventory,
variable_manager=variable_manager, loader=loader,
passwords=None)
pbex._tqm._stdout_callback = ResultCallback()
pbex.run()
运行一下,so nice…
===v2_runner_on_unreachable====host=BI-LASS-Kibana_10.60.x.x===result={'unreachable': True, 'msg': 'Failed to connect to the host via ssh: ssh: connect to host 10.60.x.x port 22: Operation timed out', 'changed': False}
===v2_runner_on_unreachable====host=BI-LASS-Kibana_10.60.x.x===result={'unreachable': True, 'msg': 'Failed to connect to the host via ssh: ssh: connect to host 10.60.x.x port 22: Operation timed out', 'changed': False}
===========play executes completed========
共同学习,写下你的评论
评论加载中...
作者其他优质文章