Preface

It has been far too long since my last post. Not that I was especially busy; my enthusiasm for learning dipped and I am fairly lazy, so even when I built something new or learned something, I could not be bothered to write it up. Recently I put together a Baidu AI web toy that does face recognition/registration, beauty scoring, and gesture recognition, and, the coolest part, it can be driven by voice commands. It looks roughly like the screenshot below. The face part covers face registration and 1:N face recognition; the earlier article 百度人脸识别HTTP SDK实战:基于C# ASP.NET CORE MVC 3.1 covered that in detail. Beauty scoring is just one extra parameter added to that code (a quick sketch follows; the Baidu docs spell it out), so it is not discussed further here. This post focuses on the code for gesture recognition and speech recognition.
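For completeness, here is a minimal sketch of that beauty-score tweak, so this post can stay focused on gestures and speech. The method name and payload shape are mine; the face_field parameter follows Baidu's Face Detect V3 docs, and the access token comes from the same OAuth flow as the GetAccessToken() shown below:

    // Minimal sketch: beauty scoring is the regular Face Detect call
    // with "beauty" added to face_field.
    // Needs System.Net.Http, System.Text and Newtonsoft.Json.
    public async Task<string> GetBeautyScoreAsync(string imgBase64, string accessToken)
    {
        string host = "https://aip.baidubce.com/rest/2.0/face/v3/detect?access_token=" + accessToken;
        var payload = new
        {
            image = imgBase64,
            image_type = "BASE64",
            face_field = "beauty" // the one extra parameter
        };
        using var client = new HttpClient();
        var content = new StringContent(JsonConvert.SerializeObject(payload), Encoding.UTF8, "application/json");
        var response = await client.PostAsync(host, content);
        // the score sits at result.face_list[0].beauty in the response JSON
        return await response.Content.ReadAsStringAsync();
    }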

Gesture Recognition

The effect is shown in the screenshot below. 24 gesture types are supported; the full list is at https://cloud.baidu.com/doc/BODY/s/Dk3cpyr8l

Code logic

The official docs already provide a good C# sample for this part; it only needs minor tweaks to work.

Controller logic

// Requires System.Net, System.Net.Http, System.Web, System.IO,
// Newtonsoft.Json and Newtonsoft.Json.Linq.
public string GetAccessToken()
{
    string authHost = "https://aip.baidubce.com/oauth/2.0/token";
    HttpClient client = new HttpClient();
    List<KeyValuePair<string, string>> paraList = new List<KeyValuePair<string, string>>();
    paraList.Add(new KeyValuePair<string, string>("grant_type", "client_credentials"));
    paraList.Add(new KeyValuePair<string, string>("client_id", _configuration["BaiduAiConfig:BaiDuGestureRecon:ApiKey_Gesture"]));
    paraList.Add(new KeyValuePair<string, string>("client_secret",
        _configuration["BaiduAiConfig:BaiDuGestureRecon:SecretKey_Gesture"]));

    HttpResponseMessage response = client.PostAsync(authHost, new FormUrlEncodedContent(paraList)).Result;
    string result = response.Content.ReadAsStringAsync().Result;
    var resultJson = JsonConvert.DeserializeObject<JObject>(result);
    AccessToken = resultJson["access_token"].ToString();
    return AccessToken;
}

public IActionResult GestureFromWeb(string imgData64FromAjax)
{
    GetAccessToken();
    string host = "https://aip.baidubce.com/rest/2.0/image-classify/v1/gesture?access_token=" + AccessToken;
    Encoding encoding = Encoding.Default;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(host);
    request.Method = "POST";
    request.KeepAlive = true;
    // base64 encoding of the image
    // string base64 = GetFileBase64("[local image file]");
    string requestImgData64 = imgData64FromAjax;
    // strip the "data:image/png;base64," prefix that canvas.toDataURL prepends
    requestImgData64 = requestImgData64.Substring(requestImgData64.IndexOf(",") + 1);
    string str = "image=" + HttpUtility.UrlEncode(requestImgData64);
    byte[] buffer = encoding.GetBytes(str);
    request.ContentLength = buffer.Length;
    using (var reqStream = request.GetRequestStream())
    {
        reqStream.Write(buffer, 0, buffer.Length);
    }
    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
    StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default);
    string result = reader.ReadToEnd();
    var resultJson = JsonConvert.DeserializeObject<JObject>(result);
    if (int.Parse(resultJson["result_num"].ToString()) != 0)
    {
        string gestureToken = resultJson["result"][0]["classname"].ToString();
        GestureResultDict resultDict = new GestureResultDict();
        try
        {
            // look the classname up in the gesture mapping dictionary below
            string resultStr = resultDict.resultDict.FirstOrDefault(x => x.Key == gestureToken).Value;
            if (!string.IsNullOrWhiteSpace(resultStr))
            {
                return Json(resultStr);
            }
            return Json("无法识别手势");
        }
        catch
        {
            return Json("无法识别手势");
        }
    }
    return RedirectToAction("index", "home");
}
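One design note: GestureFromWeb fetches a fresh access token on every request, but Baidu's token response carries an expires_in of roughly 30 days, so the token is a natural candidate for caching. A minimal sketch with IMemoryCache (the cache key and expiry window are mine; assumes services.AddMemoryCache() is registered and an IMemoryCache _cache is injected):

    // Sketch: reuse one token instead of re-requesting it per call.
    private string GetAccessTokenCached()
    {
        return _cache.GetOrCreate("baidu_gesture_token", entry =>
        {
            // renew comfortably before Baidu's ~30-day expiry
            entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(25);
            return GetAccessToken(); // the method above
        });
    }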

Gesture mapping dictionary


public class GestureResultDict
{
    // Maps Baidu's gesture classnames to the display text sent back to the page
    // (a subset of the 24 supported classnames; extend as needed).
    public Dictionary<string, string> resultDict = new Dictionary<string, string>()
    {
        { "Ok", "Ok" },
        { "Six", "数字6" },
        { "Rock", "Rock" },
        { "Thumb_up", "点赞" },
        { "One", "数字1" },
        { "Five", "数字5" },
        { "Fist", "拳头" },
        { "Prayer", "上天保佑" },
        { "Congratulation", "恭喜恭喜" },
        { "Heart_single", "笔芯" },
        { "Thumb_down", "鄙视你" },
        { "ILY", "黑凤梨" },
        { "Insult", "竖中指" },
        { "Nine", "数字9" },
        { "Eight", "数字8" },
        { "Seven", "数字7" },
        { "Four", "数字4" },
        { "Two", "数字2/Yeah" }
    };
}
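A small usage note: since resultDict is a plain Dictionary, the FirstOrDefault scan plus try/catch in the controller can be collapsed into a single TryGetValue. A sketch, using the same variable names as the controller above:

    // Sketch: dictionary lookup without LINQ or exception handling.
    if (resultDict.resultDict.TryGetValue(gestureToken, out string resultStr)
        && !string.IsNullOrWhiteSpace(resultStr))
    {
        return Json(resultStr);
    }
    return Json("无法识别手势");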

 

The key point here is that the front end sends the image to the controller as a base64 string; the front-end logic uses jQuery.

function reconGesture() {
    let video = document.getElementById("video");
    var canvas = $('#canvasGesture')[0];
    let ctx = canvas.getContext('2d');
    canvas.height = 465;
    canvas.width = 400;
    ctx.drawImage(video, 0, 0, 400, 400); // grab the current video frame
    ctx.scale(-1, 1); // note: only affects draws after this call, not the captured frame
    var img = convertCanvasToImage(canvas);
    $.ajax({
        url: '/Gesture/GestureFromWeb',
        type: 'post',
        dataType: 'json',
        data: { "imgData64FromAjax": img.src },
        success: function (jsonStr) {
            var data = JSON.stringify(jsonStr);
            console.log(data);
            $("#gestureText").html("手势识别结果为:" + data);
        }
    })

    //let img = document.getElementById("canvas").toDataURL("image/png");
    //var triggerDownload = $("#downloadA").attr("href", img).attr("download", "micro-blog.png");
    //triggerDownload[0].click();
}


// Extract an image from a canvas
function convertCanvasToImage(canvas) {
    // a new Image object (a detached DOM element)
    var image = new Image();
    // canvas.toDataURL returns a base64-encoded data URL
    // in the requested format, PNG here
    image.src = canvas.toDataURL("image/png");
    return image;
}

Voice Control

The speech part is a bit more troublesome. Recorder.js is used as the recording plugin to capture and upload audio, and Baidu speech recognition imposes the following requirements on audio samples:

Raw PCM recordings must use a 16000 Hz or 8000 Hz sample rate, 16-bit depth, and a single (mono) channel. Supported formats: pcm (uncompressed), wav (uncompressed, PCM-encoded), and amr (compressed). Recorder.js therefore needed a few code changes to match this configuration; a quick server-side sanity check is sketched right below, and the complete modified JS file follows it.
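If you want to verify that a clip actually meets those constraints before sending it to Baidu, the relevant fields sit at fixed offsets in the standard 44-byte PCM WAV header, which is exactly what the encodeWAV function in the Recorder.js below writes. A hedged sketch (the method name is mine, and it assumes a little-endian host, the usual case):

    // Sketch: verify an uploaded WAV meets Baidu's ASR constraints.
    public static bool IsValidForBaiduAsr(byte[] wav)
    {
        if (wav.Length < 44) return false;                     // too short for a WAV header
        short channels      = BitConverter.ToInt16(wav, 22);   // channel count
        int sampleRate      = BitConverter.ToInt32(wav, 24);   // samples per second
        short bitsPerSample = BitConverter.ToInt16(wav, 34);   // bit depth
        return channels == 1
            && (sampleRate == 16000 || sampleRate == 8000)
            && bitsPerSample == 16;
    }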

Full Recorder.js code

(function (f) { if (typeof exports === "object" && typeof module !== "undefined") { module.exports = f() } else if (typeof define === "function" && define.amd) { define([], f) } else { var g; if (typeof window !== "undefined") { g = window } else if (typeof global !== "undefined") { g = global } else if (typeof self !== "undefined") { g = self } else { g = this } g.Recorder = f() } })(function () {
var define, module, exports; return (function e(t, n, r) { function s(o, u) { if (!n[o]) { if (!t[o]) { var a = typeof require == "function" && require; if (!u && a) return a(o, !0); if (i) return i(o, !0); var f = new Error("Cannot find module '" + o + "'"); throw f.code = "MODULE_NOT_FOUND", f } var l = n[o] = { exports: {} }; t[o][0].call(l.exports, function (e) { var n = t[o][1][e]; return s(n ? n : e) }, l, l.exports, e, t, n, r) } return n[o].exports } var i = typeof require == "function" && require; for (var o = 0; o < r.length; o++)s(r[o]); return s })({
1: [function (require, module, exports) {
"use strict";
module.exports = require("./recorder").Recorder;
}, { "./recorder": 2 }], 2: [function (require, module, exports) {
'use strict';
var _createClass = (function () {
function defineProperties(target, props) {
for (var i = 0; i < props.length; i++) {
var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor);
}
} return function (Constructor, protoProps, staticProps) {
if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor;
};
})();
Object.defineProperty(exports, "__esModule", {
value: true
});
exports.Recorder = undefined;
var _inlineWorker = require('inline-worker');
var _inlineWorker2 = _interopRequireDefault(_inlineWorker);
function _interopRequireDefault(obj) {
return obj && obj.__esModule ? obj : { default: obj };
}
function _classCallCheck(instance, Constructor) {
if (!(instance instanceof Constructor)) {
throw new TypeError("Cannot call a class as a function");
}
}
var Recorder = exports.Recorder = (function () {
function Recorder(source, cfg) {
var _this = this;
_classCallCheck(this, Recorder);
this.config = {
bufferLen: 4096,
numChannels: 2,
mimeType: 'audio_pcm/wav'
};
this.recording = false;
this.callbacks = {
getBuffer: [],
exportWAV: []
};
Object.assign(this.config, cfg);
this.context = source.context;
this.node = (this.context.createScriptProcessor || this.context.createJavaScriptNode).call(this.context, this.config.bufferLen, this.config.numChannels, this.config.numChannels);
this.node.onaudioprocess = function (e) {
if (!_this.recording) return;
var buffer = [];
for (var channel = 0; channel < _this.config.numChannels; channel++) {
buffer.push(e.inputBuffer.getChannelData(channel));
}
_this.worker.postMessage({
command: 'record',
buffer: buffer
});
};
source.connect(this.node);
this.node.connect(this.context.destination); //this should not be necessary
var self = {};
this.worker = new _inlineWorker2.default(function () {
var recLength = 0,
recBuffers = [],
sampleRate = undefined,
numChannels = undefined;
//  var sampleStep = this.context.sampleRate / sampleRate;
self.onmessage = function (e) {
switch (e.data.command) {
case 'init':
init(e.data.config);
break;
case 'record':
record(e.data.buffer);
break;
case 'exportWAV':
exportWAV(e.data.type);
break;
case 'getBuffer':
getBuffer();
break;
case 'clear':
clear();
break;
}
};
function init(config) {
sampleRate = config.sampleRate;
numChannels = config.numChannels;
initBuffers();
}
function record(inputBuffer) {
for (var channel = 0; channel < numChannels; channel++) {
recBuffers[channel].push(inputBuffer[channel]);
}
recLength += inputBuffer[0].length;
}
function exportWAV(type) {
var buffers = [];
for (var channel = 0; channel < numChannels; channel++) {
buffers.push(mergeBuffers(recBuffers[channel], recLength));
}
var interleaved = undefined;
if (numChannels === 2) {
interleaved = interleave(buffers[0], buffers[1]);
} else {
// handle a single (mono) channel
interleaved = extractSingleChannel(buffers[0]);
}
var dataview = encodeWAV(interleaved);
var audioBlob = new Blob([dataview], { type: type });
self.postMessage({ command: 'exportWAV', data: audioBlob });
}
function getBuffer() {
var buffers = [];
for (var channel = 0; channel < numChannels; channel++) {
buffers.push(mergeBuffers(recBuffers[channel], recLength));
}
self.postMessage({ command: 'getBuffer', data: buffers });
}
function clear() {
recLength = 0;
recBuffers = [];
initBuffers();
}
function initBuffers() {
for (var channel = 0; channel < numChannels; channel++) {
recBuffers[channel] = [];
}
}
function mergeBuffers(recBuffers, recLength) {
var result = new Float32Array(recLength);
var offset = 0;
for (var i = 0; i < recBuffers.length; i++) {
result.set(recBuffers[i], offset);
offset += recBuffers[i].length;
}
return result;
}
function interleave(inputL, inputR) {
var length = inputL.length + inputR.length;
var result = new Float32Array(length);
var index = 0,
inputIndex = 0;
while (index < length) {
result[index++] = inputL[inputIndex];
result[index++] = inputR[inputIndex];
inputIndex++;
}
return result;
}
function extractSingleChannel(input) {
// if the data were not shortened proportionally here, the output file would contain
// sampleStep times as much empty audio (sampleStep is 1 here because the page below
// creates the AudioContext at 16000 Hz)
var length = Math.ceil(input.length / 1);
var result = new Float32Array(length);
var index = 0,
inputIndex = 0;
while (index < length) {
// the key step: take one input sample every sampleStep points and copy it into result
result[index++] = input[inputIndex];
inputIndex += 1;
}
return result;
}
function floatTo16BitPCM(output, offset, input) {
for (var i = 0; i < input.length; i++, offset += 2) {
var s = Math.max(-1, Math.min(1, input[i]));
output.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
}
}
function writeString(view, offset, string) {
for (var i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
}
function encodeWAV(samples) {
var buffer = new ArrayBuffer(44 + samples.length * 2);
var view = new DataView(buffer);
/* RIFF identifier */
writeString(view, 0, 'RIFF');
/* RIFF chunk length */
view.setUint32(4, 36 + samples.length * 2, true);
/* RIFF type */
writeString(view, 8, 'WAVE');
/* format chunk identifier */
writeString(view, 12, 'fmt ');
/* format chunk length */
view.setUint32(16, 16, true);
/* sample format (raw) */
view.setUint16(20, 1, true);
/* channel count */
view.setUint16(22, numChannels, true);
/* sample rate */
view.setUint32(24, sampleRate, true);
/* byte rate (sample rate * block align) */
view.setUint32(28, sampleRate * 4, true);
/* block align (channel count * bytes per sample) */
view.setUint16(32, numChannels * 2, true);
/* bits per sample */
view.setUint16(34, 16, true);
/* data chunk identifier */
writeString(view, 36, 'data');
/* data chunk length */
view.setUint32(40, samples.length * 2, true);
floatTo16BitPCM(view, 44, samples);
return view;
}
}, self);
this.worker.postMessage({
command: 'init',
config: {
sampleRate: this.context.sampleRate,
numChannels: this.config.numChannels
}
});
this.worker.onmessage = function (e) {
var cb = _this.callbacks[e.data.command].pop();
if (typeof cb == 'function') {
cb(e.data.data);
}
};
}
_createClass(Recorder, [{
key: 'record',
value: function record() {
this.recording = true;
}
}, {
key: 'stop',
value: function stop() {
this.recording = false;
}
}, {
key: 'clear',
value: function clear() {
this.worker.postMessage({ command: 'clear' });
}
}, {
key: 'getBuffer',
value: function getBuffer(cb) {
cb = cb || this.config.callback;
if (!cb) throw new Error('Callback not set');
this.callbacks.getBuffer.push(cb);
this.worker.postMessage({ command: 'getBuffer' });
}
}, {
key: 'exportWAV',
value: function exportWAV(cb, mimeType) {
mimeType = mimeType || this.config.mimeType;
cb = cb || this.config.callback;
if (!cb) throw new Error('Callback not set');
this.callbacks.exportWAV.push(cb);
this.worker.postMessage({
command: 'exportWAV',
type: mimeType
});
}
}], [{
key: 'forceDownload',
value: function forceDownload(blob, filename) {
var url = (window.URL || window.webkitURL).createObjectURL(blob);
var link = window.document.createElement('a');
link.href = url;
link.download = filename || 'output.wav';
var click = document.createEvent("Event");
click.initEvent("click", true, true);
link.dispatchEvent(click);
}
}]);
return Recorder;
})();
exports.default = Recorder;
}, { "inline-worker": 3 }], 3: [function (require, module, exports) {
"use strict";
module.exports = require("./inline-worker");
}, { "./inline-worker": 4 }], 4: [function (require, module, exports) {
(function (global) {
"use strict";
var _createClass = (function () { function defineProperties(target, props) { for (var key in props) { var prop = props[key]; prop.configurable = true; if (prop.value) prop.writable = true; } Object.defineProperties(target, props); } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; })();
var _classCallCheck = function (instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } };
var WORKER_ENABLED = !!(global === global.window && global.URL && global.Blob && global.Worker);
var InlineWorker = (function () {
function InlineWorker(func, self) {
var _this = this;
_classCallCheck(this, InlineWorker);
if (WORKER_ENABLED) {
var functionBody = func.toString().trim().match(/^function\s*\w*\s*\([\w\s,]*\)\s*{([\w\W]*?)}$/)[1];
var url = global.URL.createObjectURL(new global.Blob([functionBody], { type: "text/javascript" }));
return new global.Worker(url);
}
this.self = self;
this.self.postMessage = function (data) {
setTimeout(function () {
_this.onmessage({ data: data });
}, 0);
};
setTimeout(function () {
func.call(self);
}, 0);
}
_createClass(InlineWorker, {
postMessage: {
value: function postMessage(data) {
var _this = this;
setTimeout(function () {
_this.self.onmessage({ data: data });
}, 0);
}
}
});
return InlineWorker;
})();
module.exports = InlineWorker;
}).call(this, typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {})
}, {}]
}, {}, [1])(1)
});

Copy the code above and include it in your page.

Front-end logic

Pay attention to this line:

    var audio_context = new AudioContext({ sampleRate: 16000 }); // audio context object

Leave it exactly as written; it matches the logic in the modified Recorder.js above, and it is what makes the sampled audio meet Baidu's requirements, so the code can be copied and used as-is. To control face recognition and the other features by voice, only a little extra logic is needed: the front end checks whether certain keywords appear in the recognized text and triggers the corresponding method. The sample code below triggers on "手势识别" (gesture recognition). A timer makes it a bit smarter: click to start recording, and speech recognition runs automatically 5 seconds later.

<script type="text/javascript">
    var reco = null;
    // var audio_context = new AudioContext(); // audio context object
    // note: navigator.getUserMedia is deprecated; modern browsers expose
    // navigator.mediaDevices.getUserMedia instead
    navigator.getUserMedia = (navigator.getUserMedia ||
        navigator.webkitGetUserMedia ||
        navigator.mozGetUserMedia ||
        navigator.msGetUserMedia); // prefixed fallbacks for older browsers
    navigator.getUserMedia({ audio: true }, create_stream, function (err) {
        console.log(err)
    });

    function create_stream(user_media) {
        // sampleRate is hard-coded to 16000 to match Baidu's requirements
        var audio_context = new AudioContext({ sampleRate: 16000 });
        var stream_input = audio_context.createMediaStreamSource(user_media);
        reco = new Recorder(stream_input, {
            numChannels: 1 // mono, also required by Baidu
        });
    }

    var clock = '';
    function start_reco() {
        reco.record();
        // run recognition automatically 5 s after recording starts;
        // ai_reco clears the interval, so it only fires once
        clock = setInterval(ai_reco, 5000)
        console.log("666")
    }

    function ai_reco() {
        reco.stop();
        clearInterval(clock);
        reco.exportWAV(function (wav_file) {
            console.log(wav_file);
            var formdata = new FormData();
            formdata.append("audio", wav_file); // same name as the IFormFile parameter
            $.ajax({
                url: "/Recorder/RecorderVoice",
                type: 'post',
                processData: false,
                contentType: false,
                data: formdata,
                dataType: 'json',
                success: function (jsonStr) {
                    var data = JSON.stringify(jsonStr);
                    // keyword check: if the recognized text contains "手势识别",
                    // click the gesture-recognition button
                    if (data.search("手势识别") != -1) {
                        $("#btn_rcon").click();
                    }
                    $("#voiceText").html("语音识别结果:" + data);
                    // document.getElementById("player").src = "/get_audio/" + data.filename;
                }
            })
        });
        reco.clear();
    }
</script>

Back-end logic

Not much to explain here. Just remember to enable the corresponding speech service on Baidu's console, fill in the matching keys, app ID, and other parameters, and also enable the Mandarin API.
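For orientation, the configuration keys read throughout this post imply an appsettings.json shaped roughly like the sketch below. The values are placeholders, and the "SecertKey_Language" spelling deliberately mirrors the key string the code uses:

    {
      "BaiduAiConfig": {
        "BaiDuGestureRecon": {
          "ApiKey_Gesture": "<api key>",
          "SecretKey_Gesture": "<secret key>"
        },
        "BaiDuLanguage": {
          "AppId_Language": "<app id>",
          "ApiKey_Language": "<api key>",
          "SecertKey_Language": "<secret key>"
        }
      }
    }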

public IActionResult RecorderVoice([FromForm] IFormFile audio)
{
    string appId = _configuration["BaiduAiConfig:BaiDuLanguage:AppId_Language"];
    string apiKey = _configuration["BaiduAiConfig:BaiDuLanguage:ApiKey_Language"];
    string secretKey = _configuration["BaiduAiConfig:BaiDuLanguage:SecertKey_Language"];
    var client = new Baidu.Aip.Speech.Asr(appId, apiKey, secretKey);

    // save the uploaded clip under wwwroot/files with a short random name
    string filename = Path.Combine("wwwroot/files", Guid.NewGuid().ToString().Substring(0, 6) + ".wav");
    using (FileStream fs = System.IO.File.Create(filename))
    {
        audio.CopyTo(fs);
        fs.Flush();
    }
    byte[] arr = System.IO.File.ReadAllBytes(filename);

    // optional parameters: dev_pid 1537 = Mandarin, 1737 = English
    var options = new Dictionary<string, object>
    {
        {"dev_pid", 1537}
        //  {"dev_pid", 1737}
    };
    client.Timeout = 120000; // for longer audio, set a larger timeout (ms)

    var result = client.Recognize(arr, "wav", 16000, options);
    if (int.Parse(result["err_no"].ToString()) == 0 && result["err_msg"].ToString() == "success.")
    {
        return Json(result["result"][0].ToString());
    }
    return Json("Error");
}
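Incidentally, the byte array is all the SDK needs, so if you do not care about keeping the clips on disk, the temp file under wwwroot/files can be skipped entirely. A sketch of the in-memory variant, using the same variable names as above:

    // Sketch: buffer the upload in memory instead of writing a temp .wav.
    byte[] arr;
    using (var ms = new MemoryStream())
    {
        audio.CopyTo(ms);
        arr = ms.ToArray();
    }
    var result = client.Recognize(arr, "wav", 16000, options);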


Conclusion

That is about it. If you need the complete runnable code, follow me and send a private message... and now, back to moving bricks...

Original post: https://blog.csdn.net/weixin_41372626/article/details/109637357