您好,登錄后才能下訂單哦!
本篇內(nèi)容介紹了“proc sql語句在數(shù)據(jù)清洗中的運(yùn)用”的有關(guān)知識(shí),在實(shí)際案例的操作過程中,不少人都會(huì)遇到這樣的困境,接下來就讓小編帶領(lǐng)大家學(xué)習(xí)一下如何處理這些情況吧!希望大家仔細(xì)閱讀,能夠?qū)W有所成!
libname clean "c:/books/clean"; /*定義永久性數(shù)據(jù)庫*/
*新建一個(gè)樣本數(shù)據(jù)one;
data one;
input X Y Z;
datalines;
1 2 3
101 202 303
44 55 66
444 555 666
;
title "Values of X from data set ONE where X is greater than 100";
/*有條件的選擇觀測(cè)值*/
proc sql;
select X
from one
where X gt 100;
quit;
*Program 8-2;
***檢查無效的字符型的數(shù)值;
title "Checking for Invalid Character Data";
proc sql;
select Patno,
Gender,
DX,
AE
from clean.patients
where Gender not in ('M','F',' ') or
notdigit(trim(DX))and not missing(DX) or
AE not in ('0','1',' ');
quit;
*檢查無效的數(shù)值型的數(shù)值;
title "Checking for out-of-range numeric values";
proc sql;
select Patno,
HR,
SBP,
DBP
from clean.patients
where HR not between 40 and 100 and not missing(HR) or
SBP not between 80 and 200 and not missing(SBP) or
DBP not between 60 and 120 and not missing(DBP);
quit;
*基于標(biāo)準(zhǔn)差利用簡(jiǎn)單的算法來檢查數(shù)值;
title "Data values beyond two standard deviations";
proc sql;
select Patno,
SBP
from clean.patients
having SBP not between mean(SBP) - 2 * std(SBP) and
mean(SBP) + 2 * std(SBP) and
SBP is not missing;
quit;
*檢查缺失值;
options linesize=84;
title "Observations with missing values";
proc sql;
select *
from clean.patients
where Patno is missing or
Gender is missing or
Visit is missing or
HR is missing or
SBP is missing or
DBP is missing or
DX is missing or
AE is missing;
quit;
*檢查日期;
title "Dates before June 1, 1998 or after October 15, 1999";
proc sql;
select Patno,
Visit
from clean.patients
where Visit not between '01jun1998'd and '15oct1999'd and
Visit is not missing;
quit;
*檢查重復(fù)值;
title "Duplicate Patient Numbers";
proc sql;
select Patno,
Visit
from clean.patients
group by Patno
having count(Patno) gt 1;
quit;
*識(shí)別對(duì)應(yīng)多個(gè)觀察值的變量;
title "Listing of patients who do not have two visits";
proc sql;
select Patno,
Visit
from clean.patients2
group by Patno
having count(Patno) ne 2;
quit;
*檢查兩個(gè)文件中對(duì)應(yīng)要求的序列號(hào)ID;
data one;
input Patno X Y;
datalines;
1 69 79
2 56 .
3 66 99
5 98 87
12 13 14
;
data two;
input Patno Z;
datalines;
1 56
3 67
4 88
5 98
13 99
;
*兩個(gè)文件都不含的ID;
title "Patient numbers not in both files";
proc sql;
select One.patno as ID_one,
Two.patno as ID_two
from one full join two
on One.patno eq Two.patno
where One.patno is missing or Two.patno is missing;
quit;
“proc sql語句在數(shù)據(jù)清洗中的運(yùn)用”的內(nèi)容就介紹到這里了,感謝大家的閱讀。如果想了解更多行業(yè)相關(guān)的知識(shí)可以關(guān)注億速云網(wǎng)站,小編將為大家輸出更多高質(zhì)量的實(shí)用文章!
免責(zé)聲明:本站發(fā)布的內(nèi)容(圖片、視頻和文字)以原創(chuàng)、轉(zhuǎn)載和分享為主,文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng),如果涉及侵權(quán)請(qǐng)聯(lián)系站長(zhǎng)郵箱:is@yisu.com進(jìn)行舉報(bào),并提供相關(guān)證據(jù),一經(jīng)查實(shí),將立刻刪除涉嫌侵權(quán)內(nèi)容。